legacies committed on
Commit
0e17e4e
·
1 Parent(s): 4ce5fde

initial files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +140 -0
  2. .pre-commit-config.yaml +23 -0
  3. CODE_OF_CONDUCT.md +128 -0
  4. CONTRIBUTING.md +92 -0
  5. Dockerfile +75 -0
  6. LICENSE +201 -0
  7. Makefile +33 -0
  8. README.md +384 -12
  9. backend/pytorch.py +93 -0
  10. doctr/__init__.py +3 -0
  11. doctr/datasets/__init__.py +26 -0
  12. doctr/datasets/cord.py +121 -0
  13. doctr/datasets/datasets/__init__.py +6 -0
  14. doctr/datasets/datasets/base.py +132 -0
  15. doctr/datasets/datasets/pytorch.py +59 -0
  16. doctr/datasets/datasets/tensorflow.py +59 -0
  17. doctr/datasets/detection.py +98 -0
  18. doctr/datasets/doc_artefacts.py +82 -0
  19. doctr/datasets/funsd.py +112 -0
  20. doctr/datasets/generator/__init__.py +6 -0
  21. doctr/datasets/generator/base.py +155 -0
  22. doctr/datasets/generator/pytorch.py +54 -0
  23. doctr/datasets/generator/tensorflow.py +60 -0
  24. doctr/datasets/ic03.py +126 -0
  25. doctr/datasets/ic13.py +99 -0
  26. doctr/datasets/iiit5k.py +103 -0
  27. doctr/datasets/iiithws.py +75 -0
  28. doctr/datasets/imgur5k.py +147 -0
  29. doctr/datasets/loader.py +102 -0
  30. doctr/datasets/mjsynth.py +106 -0
  31. doctr/datasets/ocr.py +71 -0
  32. doctr/datasets/orientation.py +40 -0
  33. doctr/datasets/recognition.py +56 -0
  34. doctr/datasets/sroie.py +103 -0
  35. doctr/datasets/svhn.py +131 -0
  36. doctr/datasets/svt.py +117 -0
  37. doctr/datasets/synthtext.py +128 -0
  38. doctr/datasets/utils.py +216 -0
  39. doctr/datasets/vocabs.py +71 -0
  40. doctr/datasets/wildreceipt.py +111 -0
  41. doctr/file_utils.py +92 -0
  42. doctr/io/__init__.py +5 -0
  43. doctr/io/elements.py +621 -0
  44. doctr/io/html.py +28 -0
  45. doctr/io/image/__init__.py +8 -0
  46. doctr/io/image/base.py +56 -0
  47. doctr/io/image/pytorch.py +109 -0
  48. doctr/io/image/tensorflow.py +110 -0
  49. doctr/io/pdf.py +42 -0
  50. doctr/io/reader.py +79 -0
.gitignore ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+ # Temp files
132
+ doctr/version.py
133
+ logs/
134
+ wandb/
135
+ .idea/
136
+
137
+ # Checkpoints
138
+ *.pt
139
+ *.pb
140
+ *.index
.pre-commit-config.yaml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v4.5.0
4
+ hooks:
5
+ - id: check-ast
6
+ - id: check-yaml
7
+ exclude: .conda
8
+ - id: check-toml
9
+ - id: check-json
10
+ - id: check-added-large-files
11
+ exclude: docs/images/
12
+ - id: end-of-file-fixer
13
+ - id: trailing-whitespace
14
+ - id: debug-statements
15
+ - id: check-merge-conflict
16
+ - id: no-commit-to-branch
17
+ args: ['--branch', 'main']
18
+ - repo: https://github.com/astral-sh/ruff-pre-commit
19
+ rev: v0.3.2
20
+ hooks:
21
+ - id: ruff
22
+ args: [ --fix ]
23
+ - id: ruff-format
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ We as members, contributors, and leaders pledge to make participation in our
6
+ community a harassment-free experience for everyone, regardless of age, body
7
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
8
+ identity and expression, level of experience, education, socio-economic status,
9
+ nationality, personal appearance, race, religion, or sexual identity
10
+ and orientation.
11
+
12
+ We pledge to act and interact in ways that contribute to an open, welcoming,
13
+ diverse, inclusive, and healthy community.
14
+
15
+ ## Our Standards
16
+
17
+ Examples of behavior that contributes to a positive environment for our
18
+ community include:
19
+
20
+ * Demonstrating empathy and kindness toward other people
21
+ * Being respectful of differing opinions, viewpoints, and experiences
22
+ * Giving and gracefully accepting constructive feedback
23
+ * Accepting responsibility and apologizing to those affected by our mistakes,
24
+ and learning from the experience
25
+ * Focusing on what is best not just for us as individuals, but for the
26
+ overall community
27
+
28
+ Examples of unacceptable behavior include:
29
+
30
+ * The use of sexualized language or imagery, and sexual attention or
31
+ advances of any kind
32
+ * Trolling, insulting or derogatory comments, and personal or political attacks
33
+ * Public or private harassment
34
+ * Publishing others' private information, such as a physical or email
35
+ address, without their explicit permission
36
+ * Other conduct which could reasonably be considered inappropriate in a
37
+ professional setting
38
+
39
+ ## Enforcement Responsibilities
40
+
41
+ Community leaders are responsible for clarifying and enforcing our standards of
42
+ acceptable behavior and will take appropriate and fair corrective action in
43
+ response to any behavior that they deem inappropriate, threatening, offensive,
44
+ or harmful.
45
+
46
+ Community leaders have the right and responsibility to remove, edit, or reject
47
+ comments, commits, code, wiki edits, issues, and other contributions that are
48
+ not aligned to this Code of Conduct, and will communicate reasons for moderation
49
+ decisions when appropriate.
50
+
51
+ ## Scope
52
+
53
+ This Code of Conduct applies within all community spaces, and also applies when
54
+ an individual is officially representing the community in public spaces.
55
+ Examples of representing our community include using an official e-mail address,
56
+ posting via an official social media account, or acting as an appointed
57
+ representative at an online or offline event.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported to the community leaders responsible for enforcement at
63
+ contact@mindee.com.
64
+ All complaints will be reviewed and investigated promptly and fairly.
65
+
66
+ All community leaders are obligated to respect the privacy and security of the
67
+ reporter of any incident.
68
+
69
+ ## Enforcement Guidelines
70
+
71
+ Community leaders will follow these Community Impact Guidelines in determining
72
+ the consequences for any action they deem in violation of this Code of Conduct:
73
+
74
+ ### 1. Correction
75
+
76
+ **Community Impact**: Use of inappropriate language or other behavior deemed
77
+ unprofessional or unwelcome in the community.
78
+
79
+ **Consequence**: A private, written warning from community leaders, providing
80
+ clarity around the nature of the violation and an explanation of why the
81
+ behavior was inappropriate. A public apology may be requested.
82
+
83
+ ### 2. Warning
84
+
85
+ **Community Impact**: A violation through a single incident or series
86
+ of actions.
87
+
88
+ **Consequence**: A warning with consequences for continued behavior. No
89
+ interaction with the people involved, including unsolicited interaction with
90
+ those enforcing the Code of Conduct, for a specified period of time. This
91
+ includes avoiding interactions in community spaces as well as external channels
92
+ like social media. Violating these terms may lead to a temporary or
93
+ permanent ban.
94
+
95
+ ### 3. Temporary Ban
96
+
97
+ **Community Impact**: A serious violation of community standards, including
98
+ sustained inappropriate behavior.
99
+
100
+ **Consequence**: A temporary ban from any sort of interaction or public
101
+ communication with the community for a specified period of time. No public or
102
+ private interaction with the people involved, including unsolicited interaction
103
+ with those enforcing the Code of Conduct, is allowed during this period.
104
+ Violating these terms may lead to a permanent ban.
105
+
106
+ ### 4. Permanent Ban
107
+
108
+ **Community Impact**: Demonstrating a pattern of violation of community
109
+ standards, including sustained inappropriate behavior, harassment of an
110
+ individual, or aggression toward or disparagement of classes of individuals.
111
+
112
+ **Consequence**: A permanent ban from any sort of public interaction within
113
+ the community.
114
+
115
+ ## Attribution
116
+
117
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118
+ version 2.0, available at
119
+ https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
120
+
121
+ Community Impact Guidelines were inspired by [Mozilla's code of conduct
122
+ enforcement ladder](https://github.com/mozilla/diversity).
123
+
124
+ [homepage]: https://www.contributor-covenant.org
125
+
126
+ For answers to common questions about this code of conduct, see the FAQ at
127
+ https://www.contributor-covenant.org/faq. Translations are available at
128
+ https://www.contributor-covenant.org/translations.
CONTRIBUTING.md ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to docTR
2
+
3
+ Everything you need to know to contribute efficiently to the project.
4
+
5
+ ## Codebase structure
6
+
7
+ - [doctr](https://github.com/mindee/doctr/blob/main/doctr) - The package codebase
8
+ - [tests](https://github.com/mindee/doctr/blob/main/tests) - Python unit tests
9
+ - [docs](https://github.com/mindee/doctr/blob/main/docs) - Library documentation building
10
+ - [scripts](https://github.com/mindee/doctr/blob/main/scripts) - Example scripts
11
+ - [references](https://github.com/mindee/doctr/blob/main/references) - Reference training scripts
12
+ - [demo](https://github.com/mindee/doctr/blob/main/demo) - Small demo app to showcase docTR capabilities
13
+ - [api](https://github.com/mindee/doctr/blob/main/api) - A minimal template to deploy a REST API with docTR
14
+
15
+ ## Continuous Integration
16
+
17
+ This project uses the following integrations to ensure proper codebase maintenance:
18
+
19
 + - [GitHub Workflow](https://help.github.com/en/actions/configuring-and-managing-workflows/configuring-a-workflow) - run jobs for package build and coverage
20
+ - [Codecov](https://codecov.io/) - reports back coverage results
21
+
22
+ As a contributor, you will only have to ensure coverage of your code by adding appropriate unit testing of your code.
23
+
24
+ ## Feedback
25
+
26
+ ### Feature requests & bug report
27
+
28
+ Whether you encountered a problem, or you have a feature suggestion, your input has value and can be used by contributors to reference it in their developments. For this purpose, we advise you to use Github [issues](https://github.com/mindee/doctr/issues).
29
+
30
+ First, check whether the topic wasn't already covered in an open / closed issue. If not, feel free to open a new one! When doing so, use issue templates whenever possible and provide enough information for other contributors to jump in.
31
+
32
+ ### Questions
33
+
34
+ If you are wondering how to do something with docTR, or a more general question, you should consider checking out Github [discussions](https://github.com/mindee/doctr/discussions). See it as a Q&A forum, or the docTR-specific StackOverflow!
35
+
36
+ ## Developing docTR
37
+
38
+ ### Developer mode installation
39
+
40
+ Install all additional dependencies with the following command:
41
+
42
+ ```shell
43
+ python -m pip install --upgrade pip
44
+ pip install -e .[dev]
45
+ pre-commit install
46
+ ```
47
+
48
+ ### Commits
49
+
50
+ - **Code**: ensure to provide docstrings to your Python code. In doing so, please follow [Google-style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) so it can ease the process of documentation later.
51
+ - **Commit message**: please follow [Udacity guide](http://udacity.github.io/git-styleguide/)
52
+
53
+ ### Unit tests
54
+
55
+ In order to run the same unit tests as the CI workflows, you can run unittests locally:
56
+
57
+ ```shell
58
+ make test
59
+ ```
60
+
61
+ ### Code quality
62
+
63
+ To run all quality checks together
64
+
65
+ ```shell
66
+ make quality
67
+ ```
68
+
69
+ #### Code style verification
70
+
71
+ To run all style checks together
72
+
73
+ ```shell
74
+ make style
75
+ ```
76
+
77
+ ### Modifying the documentation
78
+
79
+ The current documentation is built using `sphinx` thanks to our CI.
80
+ You can build the documentation locally:
81
+
82
+ ```shell
83
+ make docs-single-version
84
+ ```
85
+
86
+ Please note that files that have not been modified will not be rebuilt. If you want to force a complete rebuild, you can delete the `_build` directory. Additionally, you may need to clear your web browser's cache to see the modifications.
87
+
88
+ You can now open your local version of the documentation located at `docs/_build/index.html` in your browser
89
+
90
+ ## Let's connect
91
+
92
+ Should you wish to connect somewhere else than on GitHub, feel free to join us on [Slack](https://join.slack.com/t/mindee-community/shared_invite/zt-uzgmljfl-MotFVfH~IdEZxjp~0zldww), where you will find a `#doctr` channel!
Dockerfile ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM ubuntu:22.04
2
+
3
+ ENV DEBIAN_FRONTEND=noninteractive
4
+ ENV LANG=C.UTF-8
5
+ ENV PYTHONUNBUFFERED=1
6
+ ENV PYTHONDONTWRITEBYTECODE=1
7
+
8
+ ARG SYSTEM=gpu
9
+
10
+ # Enroll NVIDIA GPG public key and install CUDA
11
+ RUN if [ "$SYSTEM" = "gpu" ]; then \
12
+ apt-get update && \
13
+ apt-get install -y gnupg ca-certificates wget && \
14
+ # - Install Nvidia repo keys
15
+ # - See: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#network-repo-installation-for-ubuntu
16
+ wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
17
+ dpkg -i cuda-keyring_1.1-1_all.deb && \
18
+ apt-get update && apt-get install -y --no-install-recommends \
19
+ cuda-command-line-tools-11-8 \
20
+ cuda-cudart-dev-11-8 \
21
+ cuda-nvcc-11-8 \
22
+ cuda-cupti-11-8 \
23
+ cuda-nvprune-11-8 \
24
+ cuda-libraries-11-8 \
25
+ cuda-nvrtc-11-8 \
26
+ libcufft-11-8 \
27
+ libcurand-11-8 \
28
+ libcusolver-11-8 \
29
+ libcusparse-11-8 \
30
+ libcublas-11-8 \
31
+ # - CuDNN: https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#ubuntu-network-installation
32
+ libcudnn8=8.6.0.163-1+cuda11.8 \
33
+ libnvinfer-plugin8=8.6.1.6-1+cuda11.8 \
34
+ libnvinfer8=8.6.1.6-1+cuda11.8; \
35
+ fi
36
+
37
+ RUN apt-get update && apt-get install -y --no-install-recommends \
38
+ # - Other packages
39
+ build-essential \
40
+ pkg-config \
41
+ curl \
42
+ wget \
43
+ software-properties-common \
44
+ unzip \
45
+ git \
46
+ # - Packages to build Python
47
+ tar make gcc zlib1g-dev libffi-dev libssl-dev liblzma-dev libbz2-dev libsqlite3-dev \
48
+ # - Packages for docTR
49
+ libgl1-mesa-dev libsm6 libxext6 libxrender-dev libpangocairo-1.0-0 \
50
+ && apt-get clean \
51
+ && rm -rf /var/lib/apt/lists/* \
52
+ fi
53
+
54
+ # Install Python
55
+ ARG PYTHON_VERSION=3.10.13
56
+
57
+ RUN wget http://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz && \
58
+ tar -zxf Python-$PYTHON_VERSION.tgz && \
59
+ cd Python-$PYTHON_VERSION && \
60
+ mkdir /opt/python/ && \
61
+ ./configure --prefix=/opt/python && \
62
+ make && \
63
+ make install && \
64
+ cd .. && \
65
+ rm Python-$PYTHON_VERSION.tgz && \
66
+ rm -r Python-$PYTHON_VERSION
67
+
68
+ ENV PATH=/opt/python/bin:$PATH
69
+
70
+ # Install docTR
71
+ ARG FRAMEWORK=tf
72
+ ARG DOCTR_REPO='mindee/doctr'
73
+ ARG DOCTR_VERSION=main
74
+ RUN pip3 install -U pip setuptools wheel && \
75
+ pip3 install "python-doctr[$FRAMEWORK]@git+https://github.com/$DOCTR_REPO.git@$DOCTR_VERSION"
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2022 Mindee
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
Makefile ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .PHONY: quality style test test-common test-tf test-torch docs-single-version docs
2
+ # this target runs checks on all files
3
+ quality:
4
+ ruff check .
5
+ mypy doctr/
6
+
7
+ # this target runs checks on all files and potentially modifies some of them
8
+ style:
9
+ ruff check --fix .
10
+ ruff format .
11
+
12
+ # Run tests for the library
13
+ test:
14
+ coverage run -m pytest tests/common/
15
+ USE_TF='1' coverage run -m pytest tests/tensorflow/
16
+ USE_TORCH='1' coverage run -m pytest tests/pytorch/
17
+
18
+ test-common:
19
+ coverage run -m pytest tests/common/
20
+
21
+ test-tf:
22
+ USE_TF='1' coverage run -m pytest tests/tensorflow/
23
+
24
+ test-torch:
25
+ USE_TORCH='1' coverage run -m pytest tests/pytorch/
26
+
27
+ # Check that docs can build
28
+ docs-single-version:
29
+ sphinx-build docs/source docs/_build -a
30
+
31
+ # Check that docs can build
32
+ docs:
33
+ cd docs && bash build.sh
README.md CHANGED
@@ -1,12 +1,384 @@
1
- ---
2
- title: Doctr
3
- emoji: 📚
4
- colorFrom: yellow
5
- colorTo: blue
6
- sdk: streamlit
7
- sdk_version: 1.35.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <p align="center">
2
+ <img src="https://github.com/mindee/doctr/raw/main/docs/images/Logo_doctr.gif" width="40%">
3
+ </p>
4
+
5
+ [![Slack Icon](https://img.shields.io/badge/Slack-Community-4A154B?style=flat-square&logo=slack&logoColor=white)](https://slack.mindee.com) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE) ![Build Status](https://github.com/mindee/doctr/workflows/builds/badge.svg) [![Docker Images](https://img.shields.io/badge/Docker-4287f5?style=flat&logo=docker&logoColor=white)](https://github.com/mindee/doctr/pkgs/container/doctr) [![codecov](https://codecov.io/gh/mindee/doctr/branch/main/graph/badge.svg?token=577MO567NM)](https://codecov.io/gh/mindee/doctr) [![CodeFactor](https://www.codefactor.io/repository/github/mindee/doctr/badge?s=bae07db86bb079ce9d6542315b8c6e70fa708a7e)](https://www.codefactor.io/repository/github/mindee/doctr) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/340a76749b634586a498e1c0ab998f08)](https://app.codacy.com/gh/mindee/doctr?utm_source=github.com&utm_medium=referral&utm_content=mindee/doctr&utm_campaign=Badge_Grade) [![Doc Status](https://github.com/mindee/doctr/workflows/doc-status/badge.svg)](https://mindee.github.io/doctr) [![Pypi](https://img.shields.io/badge/pypi-v0.8.1-blue.svg)](https://pypi.org/project/python-doctr/) [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/mindee/doctr) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/quicktour.ipynb)
6
+
7
+
8
+ **Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch**
9
+
10
+ What you can expect from this repository:
11
+
12
+ - efficient ways to parse textual information (localize and identify each word) from your documents
13
+ - guidance on how to integrate this in your current architecture
14
+
15
+ ![OCR_example](https://github.com/mindee/doctr/raw/main/docs/images/ocr.png)
16
+
17
+ ## Quick Tour
18
+
19
+ ### Getting your pretrained model
20
+
21
+ End-to-End OCR is achieved in docTR using a two-stage approach: text detection (localizing words), then text recognition (identify all characters in the word).
22
+ As such, you can select the architecture used for [text detection](https://mindee.github.io/doctr/latest/modules/models.html#doctr-models-detection), and the one for [text recognition](https://mindee.github.io/doctr/latest/modules/models.html#doctr-models-recognition) from the list of available implementations.
23
+
24
+ ```python
25
+ from doctr.models import ocr_predictor
26
+
27
+ model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
28
+ ```
29
+
30
+ ### Reading files
31
+
32
+ Documents can be interpreted from PDF or images:
33
+
34
+ ```python
35
+ from doctr.io import DocumentFile
36
+ # PDF
37
+ pdf_doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
38
+ # Image
39
+ single_img_doc = DocumentFile.from_images("path/to/your/img.jpg")
40
+ # Webpage
41
+ webpage_doc = DocumentFile.from_url("https://www.yoursite.com")
42
+ # Multiple page images
43
+ multi_img_doc = DocumentFile.from_images(["path/to/page1.jpg", "path/to/page2.jpg"])
44
+ ```
45
+
46
+ ### Putting it together
47
+
48
+ Let's use the default pretrained model for an example:
49
+
50
+ ```python
51
+ from doctr.io import DocumentFile
52
+ from doctr.models import ocr_predictor
53
+
54
+ model = ocr_predictor(pretrained=True)
55
+ # PDF
56
+ doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
57
+ # Analyze
58
+ result = model(doc)
59
+ ```
60
+
61
+ ### Dealing with rotated documents
62
+
63
+ Should you use docTR on documents that include rotated pages, or pages with multiple box orientations,
64
+ you have multiple options to handle it:
65
+
66
+ - If you only use straight document pages with straight words (horizontal, same reading direction),
67
+ consider passing `assume_straight_pages=True` to the ocr_predictor. It will directly fit straight boxes
68
+ on your page and return straight boxes, which makes it the fastest option.
69
+
70
+ - If you want the predictor to output straight boxes (no matter the orientation of your pages, the final localizations
71
+ will be converted to straight boxes), you need to pass `export_as_straight_boxes=True` in the predictor. Otherwise, if `assume_straight_pages=False`, it will return rotated bounding boxes (potentially with an angle of 0°).
72
+
73
+ If both options are set to False, the predictor will always fit and return rotated boxes.
74
+
75
+ To interpret your model's predictions, you can visualize them interactively as follows:
76
+
77
+ ```python
78
+ result.show()
79
+ ```
80
+
81
+ ![Visualization sample](https://github.com/mindee/doctr/raw/main/docs/images/doctr_example_script.gif)
82
+
83
+ Or even rebuild the original document from its predictions:
84
+
85
+ ```python
86
+ import matplotlib.pyplot as plt
87
+
88
+ synthetic_pages = result.synthesize()
89
+ plt.imshow(synthetic_pages[0]); plt.axis('off'); plt.show()
90
+ ```
91
+
92
+ ![Synthesis sample](https://github.com/mindee/doctr/raw/main/docs/images/synthesized_sample.png)
93
+
94
+ The `ocr_predictor` returns a `Document` object with a nested structure (with `Page`, `Block`, `Line`, `Word`, `Artefact`).
95
+ To get a better understanding of our document model, check our [documentation](https://mindee.github.io/doctr/modules/io.html#document-structure):
96
+
97
+ You can also export them as a nested dict, more appropriate for JSON format:
98
+
99
+ ```python
100
+ json_output = result.export()
101
+ ```
102
+
103
+ ### Use the KIE predictor
104
+
105
+ The KIE predictor is a more flexible predictor compared to OCR as your detection model can detect multiple classes in a document. For example, you can have a detection model to detect just dates and addresses in a document.
106
+
107
+ The KIE predictor makes it possible to use a detector with multiple classes with a recognition model and to have the whole pipeline already set up for you.
108
+
109
+ ```python
110
+ from doctr.io import DocumentFile
111
+ from doctr.models import kie_predictor
112
+
113
+ # Model
114
+ model = kie_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
115
+ # PDF
116
+ doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
117
+ # Analyze
118
+ result = model(doc)
119
+
120
+ predictions = result.pages[0].predictions
121
+ for class_name in predictions.keys():
122
+ list_predictions = predictions[class_name]
123
+ for prediction in list_predictions:
124
+ print(f"Prediction for {class_name}: {prediction}")
125
+ ```
126
+
127
+ The KIE predictor results per page are in a dictionary format with each key representing a class name and its value being the predictions for that class.
128
+
129
+ ### If you are looking for support from the Mindee team
130
+
131
+ [![Bad OCR test detection image asking the developer if they need help](https://github.com/mindee/doctr/raw/main/docs/images/doctr-need-help.png)](https://mindee.com/product/doctr)
132
+
133
+ ## Installation
134
+
135
+ ### Prerequisites
136
+
137
+ Python 3.9 (or higher) and [pip](https://pip.pypa.io/en/stable/) are required to install docTR.
138
+
139
+ Since we use [weasyprint](https://weasyprint.org/), you will need extra dependencies if you are not running Linux.
140
+
141
+ For MacOS users, you can install them as follows:
142
+
143
+ ```shell
144
+ brew install cairo pango gdk-pixbuf libffi
145
+ ```
146
+
147
+ For Windows users, those dependencies are included in GTK. You can find the latest installer over [here](https://github.com/tschoonj/GTK-for-Windows-Runtime-Environment-Installer/releases).
148
+
149
+ ### Latest release
150
+
151
+ You can then install the latest release of the package using [pypi](https://pypi.org/project/python-doctr/) as follows:
152
+
153
+ ```shell
154
+ pip install python-doctr
155
+ ```
156
+
157
+ > :warning: Please note that the basic installation is not standalone, as it does not provide a deep learning framework, which is required for the package to run.
158
+
159
+ We try to keep framework-specific dependencies to a minimum. You can install framework-specific builds as follows:
160
+
161
+ ```shell
162
+ # for TensorFlow
163
+ pip install "python-doctr[tf]"
164
+ # for PyTorch
165
+ pip install "python-doctr[torch]"
166
+ ```
167
+
168
+ For MacBooks with M1 chip, you will need some additional packages or specific versions:
169
+
170
+ - TensorFlow 2: [metal plugin](https://developer.apple.com/metal/tensorflow-plugin/)
171
+ - PyTorch: [version >= 1.12.0](https://pytorch.org/get-started/locally/#start-locally)
172
+
173
+ ### Developer mode
174
+
175
+ Alternatively, you can install it from source, which will require you to install [Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git).
176
+ First clone the project repository:
177
+
178
+ ```shell
179
+ git clone https://github.com/mindee/doctr.git
180
+ pip install -e doctr/.
181
+ ```
182
+
183
+ Again, if you prefer to avoid the risk of missing dependencies, you can install the TensorFlow or the PyTorch build:
184
+
185
+ ```shell
186
+ # for TensorFlow
187
+ pip install -e doctr/.[tf]
188
+ # for PyTorch
189
+ pip install -e doctr/.[torch]
190
+ ```
191
+
192
+ ## Models architectures
193
+
194
+ Credits where it's due: this repository is implementing, among others, architectures from published research papers.
195
+
196
+ ### Text Detection
197
+
198
+ - DBNet: [Real-time Scene Text Detection with Differentiable Binarization](https://arxiv.org/pdf/1911.08947.pdf).
199
+ - LinkNet: [LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation](https://arxiv.org/pdf/1707.03718.pdf)
200
+ - FAST: [FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation](https://arxiv.org/pdf/2111.02394.pdf)
201
+
202
+ ### Text Recognition
203
+
204
+ - CRNN: [An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition](https://arxiv.org/pdf/1507.05717.pdf).
205
+ - SAR: [Show, Attend and Read:A Simple and Strong Baseline for Irregular Text Recognition](https://arxiv.org/pdf/1811.00751.pdf).
206
+ - MASTER: [MASTER: Multi-Aspect Non-local Network for Scene Text Recognition](https://arxiv.org/pdf/1910.02562.pdf).
207
+ - ViTSTR: [Vision Transformer for Fast and Efficient Scene Text Recognition](https://arxiv.org/pdf/2105.08582.pdf).
208
+ - PARSeq: [Scene Text Recognition with Permuted Autoregressive Sequence Models](https://arxiv.org/pdf/2207.06966).
209
+
210
+ ## More goodies
211
+
212
+ ### Documentation
213
+
214
+ The full package documentation is available [here](https://mindee.github.io/doctr/) for detailed specifications.
215
+
216
+ ### Demo app
217
+
218
+ A minimal demo app is provided for you to play with our end-to-end OCR models!
219
+
220
+ ![Demo app](https://github.com/mindee/doctr/raw/main/docs/images/demo_update.png)
221
+
222
+ #### Live demo
223
+
224
+ Courtesy of :hugs: [Hugging Face](https://huggingface.co/) :hugs:, docTR has now a fully deployed version available on [Spaces](https://huggingface.co/spaces)!
225
+ Check it out [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/mindee/doctr)
226
+
227
+ #### Running it locally
228
+
229
+ If you prefer to use it locally, there is an extra dependency ([Streamlit](https://streamlit.io/)) that is required.
230
+
231
+ ##### TensorFlow version
232
+
233
+ ```shell
234
+ pip install -r demo/tf-requirements.txt
235
+ ```
236
+
237
+ Then run your app in your default browser with:
238
+
239
+ ```shell
240
+ USE_TF=1 streamlit run demo/app.py
241
+ ```
242
+
243
+ ##### PyTorch version
244
+
245
+ ```shell
246
+ pip install -r demo/pt-requirements.txt
247
+ ```
248
+
249
+ Then run your app in your default browser with:
250
+
251
+ ```shell
252
+ USE_TORCH=1 streamlit run demo/app.py
253
+ ```
254
+
255
+ #### TensorFlow.js
256
+
257
+ Instead of having your demo actually running Python, you would prefer to run everything in your web browser?
258
+ Check out our [TensorFlow.js demo](https://github.com/mindee/doctr-tfjs-demo) to get started!
259
+
260
+ ![TFJS demo](https://github.com/mindee/doctr/raw/main/docs/images/demo_illustration_mini.png)
261
+
262
+ ### Docker container
263
+
264
+ [We offer Docker container support for easy testing and deployment](https://github.com/mindee/doctr/pkgs/container/doctr).
265
+
266
+ #### Using GPU with docTR Docker Images
267
+
268
+ The docTR Docker images are GPU-ready and based on CUDA `11.8`.
269
+ However, to use GPU support with these Docker images, please ensure that Docker is configured to use your GPU.
270
+
271
+ To verify and configure GPU support for Docker, please follow the instructions provided in the [NVIDIA Container Toolkit Installation Guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).
272
+
273
+ Once Docker is configured to use GPUs, you can run docTR Docker containers with GPU support:
274
+
275
+ ```shell
276
+ docker run -it --gpus all ghcr.io/mindee/doctr:tf-py3.8.18-gpu-2023-09 bash
277
+ ```
278
+
279
+ #### Available Tags
280
+
281
+ The Docker images for docTR follow a specific tag nomenclature: `<framework>-py<python_version>-<system>-<doctr_version|YYYY-MM>`. Here's a breakdown of the tag structure:
282
+
283
+ - `<framework>`: `tf` (TensorFlow) or `torch` (PyTorch).
284
+ - `<python_version>`: `3.8.18`, `3.9.18`, or `3.10.13`.
285
+ - `<system>`: `cpu` or `gpu`
286
+ - `<doctr_version>`: a tag >= `v0.7.1`
287
+ - `<YYYY-MM>`: e.g. `2023-09`
288
+
289
+ Here are examples of different image tags:
290
+
291
+ | Tag | Description |
292
+ |----------------------------|---------------------------------------------------|
293
+ | `tf-py3.8.18-cpu-v0.7.1` | TensorFlow with Python `3.8.18` and docTR `v0.7.1`. |
294
+ | `torch-py3.9.18-gpu-2023-09`| PyTorch with Python `3.9.18`, GPU support and a monthly build from `2023-09`. |
295
+
296
+ #### Building Docker Images Locally
297
+
298
+ You can also build docTR Docker images locally on your computer.
299
+
300
+ ```shell
301
+ docker build -t doctr .
302
+ ```
303
+
304
+ You can specify custom Python versions and docTR versions using build arguments. For example, to build a docTR image with TensorFlow, Python version `3.9.10`, and docTR version `v0.7.0`, run the following command:
305
+
306
+ ```shell
307
+ docker build -t doctr --build-arg FRAMEWORK=tf --build-arg PYTHON_VERSION=3.9.10 --build-arg DOCTR_VERSION=v0.7.0 .
308
+ ```
309
+
310
+ ### Example script
311
+
312
+ An example script is provided for a simple documentation analysis of a PDF or image file:
313
+
314
+ ```shell
315
+ python scripts/analyze.py path/to/your/doc.pdf
316
+ ```
317
+
318
+ All script arguments can be checked using `python scripts/analyze.py --help`
319
+
320
+ ### Minimal API integration
321
+
322
+ Looking to integrate docTR into your API? Here is a template to get you started with a fully working API using the wonderful [FastAPI](https://github.com/tiangolo/fastapi) framework.
323
+
324
+ #### Deploy your API locally
325
+
326
+ Specific dependencies are required to run the API template, which you can install as follows:
327
+
328
+ ```shell
329
+ cd api/
330
+ pip install poetry
331
+ make lock
332
+ pip install -r requirements.txt
333
+ ```
334
+
335
+ You can now run your API locally:
336
+
337
+ ```shell
338
+ uvicorn --reload --workers 1 --host 0.0.0.0 --port=8002 --app-dir api/ app.main:app
339
+ ```
340
+
341
+ Alternatively, you can run the same server on a docker container if you prefer using:
342
+
343
+ ```shell
344
+ PORT=8002 docker-compose up -d --build
345
+ ```
346
+
347
+ #### What you have deployed
348
+
349
+ Your API should now be running locally on your port 8002. Access your automatically-built documentation at [http://localhost:8002/redoc](http://localhost:8002/redoc) and enjoy your four functional routes ("/detection", "/recognition", "/ocr", "/kie"). Here is an example with Python to send a request to the OCR route:
350
+
351
+ ```python
352
+ import requests
353
+ with open('/path/to/your/doc.jpg', 'rb') as f:
354
+ data = f.read()
355
+ response = requests.post("http://localhost:8002/ocr", files={'file': data}).json()
356
+ ```
357
+
358
+ ### Example notebooks
359
+
360
+ Looking for more illustrations of docTR features? You might want to check the [Jupyter notebooks](https://github.com/mindee/doctr/tree/main/notebooks) designed to give you a broader overview.
361
+
362
+ ## Citation
363
+
364
+ If you wish to cite this project, feel free to use this [BibTeX](http://www.bibtex.org/) reference:
365
+
366
+ ```bibtex
367
+ @misc{doctr2021,
368
+ title={docTR: Document Text Recognition},
369
+ author={Mindee},
370
+ year={2021},
371
+ publisher = {GitHub},
372
+ howpublished = {\url{https://github.com/mindee/doctr}}
373
+ }
374
+ ```
375
+
376
+ ## Contributing
377
+
378
+ If you scrolled down to this section, you most likely appreciate open source. Do you feel like extending the range of our supported characters? Or perhaps submitting a paper implementation? Or contributing in any other way?
379
+
380
+ You're in luck, we compiled a short guide (cf. [`CONTRIBUTING`](https://mindee.github.io/doctr/contributing/contributing.html)) for you to easily do so!
381
+
382
+ ## License
383
+
384
+ Distributed under the Apache 2.0 License. See [`LICENSE`](https://github.com/mindee/doctr?tab=Apache-2.0-1-ov-file#readme) for more information.
backend/pytorch.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import numpy as np
7
+ import torch
8
+
9
+ from doctr.models import ocr_predictor
10
+ from doctr.models.predictor import OCRPredictor
11
+
12
+ DET_ARCHS = [
13
+ "db_resnet50",
14
+ "db_resnet34",
15
+ "db_mobilenet_v3_large",
16
+ "linknet_resnet18",
17
+ "linknet_resnet34",
18
+ "linknet_resnet50",
19
+ "fast_tiny",
20
+ "fast_small",
21
+ "fast_base",
22
+ ]
23
+
24
+ RECO_ARCHS = [
25
+ "crnn_vgg16_bn",
26
+ "crnn_mobilenet_v3_small",
27
+ "crnn_mobilenet_v3_large",
28
+ "master",
29
+ "sar_resnet31",
30
+ "vitstr_small",
31
+ "vitstr_base",
32
+ "parseq",
33
+ ]
34
+
35
+
36
+ def load_predictor(
37
+ det_arch: str,
38
+ reco_arch: str,
39
+ assume_straight_pages: bool,
40
+ straighten_pages: bool,
41
+ bin_thresh: float,
42
+ box_thresh: float,
43
+ device: torch.device,
44
+ ) -> OCRPredictor:
45
+ """Load a predictor from doctr.models
46
+
47
+ Args:
48
+ ----
49
+ det_arch: detection architecture
50
+ reco_arch: recognition architecture
51
+ assume_straight_pages: whether to assume straight pages or not
52
+ straighten_pages: whether to straighten rotated pages or not
53
+ bin_thresh: binarization threshold for the segmentation map
54
+ box_thresh: minimal objectness score to consider a box
55
+ device: torch.device, the device to load the predictor on
56
+
57
+ Returns:
58
+ -------
59
+ instance of OCRPredictor
60
+ """
61
+ predictor = ocr_predictor(
62
+ det_arch,
63
+ reco_arch,
64
+ pretrained=True,
65
+ assume_straight_pages=assume_straight_pages,
66
+ straighten_pages=straighten_pages,
67
+ export_as_straight_boxes=straighten_pages,
68
+ detect_orientation=not assume_straight_pages,
69
+ ).to(device)
70
+ predictor.det_predictor.model.postprocessor.bin_thresh = bin_thresh
71
+ predictor.det_predictor.model.postprocessor.box_thresh = box_thresh
72
+ return predictor
73
+
74
+
75
+ def forward_image(predictor: OCRPredictor, image: np.ndarray, device: torch.device) -> np.ndarray:
76
+ """Forward an image through the predictor
77
+
78
+ Args:
79
+ ----
80
+ predictor: instance of OCRPredictor
81
+ image: image to process
82
+ device: torch.device, the device to process the image on
83
+
84
+ Returns:
85
+ -------
86
+ segmentation map
87
+ """
88
+ with torch.no_grad():
89
+ processed_batches = predictor.det_predictor.pre_processor([image])
90
+ out = predictor.det_predictor.model(processed_batches[0].to(device), return_model_output=True)
91
+ seg_map = out["out_map"].to("cpu").numpy()
92
+
93
+ return seg_map
doctr/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from . import io, models, datasets, transforms, utils
2
+ from .file_utils import is_tf_available, is_torch_available
3
+ from .version import __version__ # noqa: F401
doctr/datasets/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from doctr.file_utils import is_tf_available
2
+
3
+ from .generator import *
4
+ from .cord import *
5
+ from .detection import *
6
+ from .doc_artefacts import *
7
+ from .funsd import *
8
+ from .ic03 import *
9
+ from .ic13 import *
10
+ from .iiit5k import *
11
+ from .iiithws import *
12
+ from .imgur5k import *
13
+ from .mjsynth import *
14
+ from .ocr import *
15
+ from .recognition import *
16
+ from .orientation import *
17
+ from .sroie import *
18
+ from .svhn import *
19
+ from .svt import *
20
+ from .synthtext import *
21
+ from .utils import *
22
+ from .vocabs import *
23
+ from .wildreceipt import *
24
+
25
+ if is_tf_available():
26
+ from .loader import *
doctr/datasets/cord.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import json
7
+ import os
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List, Tuple, Union
10
+
11
+ import numpy as np
12
+ from tqdm import tqdm
13
+
14
+ from .datasets import VisionDataset
15
+ from .utils import convert_target_to_relative, crop_bboxes_from_image
16
+
17
+ __all__ = ["CORD"]
18
+
19
+
20
+ class CORD(VisionDataset):
21
+ """CORD dataset from `"CORD: A Consolidated Receipt Dataset forPost-OCR Parsing"
22
+ <https://openreview.net/pdf?id=SJl3z659UH>`_.
23
+
24
+ .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/cord-grid.png&src=0
25
+ :align: center
26
+
27
+ >>> from doctr.datasets import CORD
28
+ >>> train_set = CORD(train=True, download=True)
29
+ >>> img, target = train_set[0]
30
+
31
+ Args:
32
+ ----
33
+ train: whether the subset should be the training one
34
+ use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
35
+ recognition_task: whether the dataset should be used for recognition task
36
+ **kwargs: keyword arguments from `VisionDataset`.
37
+ """
38
+
39
+ TRAIN = (
40
+ "https://doctr-static.mindee.com/models?id=v0.1.1/cord_train.zip&src=0",
41
+ "45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8",
42
+ "cord_train.zip",
43
+ )
44
+
45
+ TEST = (
46
+ "https://doctr-static.mindee.com/models?id=v0.1.1/cord_test.zip&src=0",
47
+ "8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58",
48
+ "cord_test.zip",
49
+ )
50
+
51
+ def __init__(
52
+ self,
53
+ train: bool = True,
54
+ use_polygons: bool = False,
55
+ recognition_task: bool = False,
56
+ **kwargs: Any,
57
+ ) -> None:
58
+ url, sha256, name = self.TRAIN if train else self.TEST
59
+ super().__init__(
60
+ url,
61
+ name,
62
+ sha256,
63
+ True,
64
+ pre_transforms=convert_target_to_relative if not recognition_task else None,
65
+ **kwargs,
66
+ )
67
+
68
+ # List images
69
+ tmp_root = os.path.join(self.root, "image")
70
+ self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
71
+ self.train = train
72
+ np_dtype = np.float32
73
+ for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking CORD", total=len(os.listdir(tmp_root))):
74
+ # File existence check
75
+ if not os.path.exists(os.path.join(tmp_root, img_path)):
76
+ raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
77
+
78
+ stem = Path(img_path).stem
79
+ _targets = []
80
+ with open(os.path.join(self.root, "json", f"{stem}.json"), "rb") as f:
81
+ label = json.load(f)
82
+ for line in label["valid_line"]:
83
+ for word in line["words"]:
84
+ if len(word["text"]) > 0:
85
+ x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"]
86
+ y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"]
87
+ box: Union[List[float], np.ndarray]
88
+ if use_polygons:
89
+ # (x, y) coordinates of top left, top right, bottom right, bottom left corners
90
+ box = np.array(
91
+ [
92
+ [x[0], y[0]],
93
+ [x[1], y[1]],
94
+ [x[2], y[2]],
95
+ [x[3], y[3]],
96
+ ],
97
+ dtype=np_dtype,
98
+ )
99
+ else:
100
+ # Reduce 8 coords to 4 -> xmin, ymin, xmax, ymax
101
+ box = [min(x), min(y), max(x), max(y)]
102
+ _targets.append((word["text"], box))
103
+
104
+ text_targets, box_targets = zip(*_targets)
105
+
106
+ if recognition_task:
107
+ crops = crop_bboxes_from_image(
108
+ img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0)
109
+ )
110
+ for crop, label in zip(crops, list(text_targets)):
111
+ self.data.append((crop, label))
112
+ else:
113
+ self.data.append((
114
+ img_path,
115
+ dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)),
116
+ ))
117
+
118
+ self.root = tmp_root
119
+
120
+ def extra_repr(self) -> str:
121
+ return f"train={self.train}"
doctr/datasets/datasets/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from doctr.file_utils import is_tf_available, is_torch_available
2
+
3
+ if is_tf_available():
4
+ from .tensorflow import *
5
+ elif is_torch_available():
6
+ from .pytorch import * # type: ignore[assignment]
doctr/datasets/datasets/base.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import os
7
+ import shutil
8
+ from pathlib import Path
9
+ from typing import Any, Callable, List, Optional, Tuple, Union
10
+
11
+ import numpy as np
12
+
13
+ from doctr.io.image import get_img_shape
14
+ from doctr.utils.data import download_from_url
15
+
16
+ from ...models.utils import _copy_tensor
17
+
18
+ __all__ = ["_AbstractDataset", "_VisionDataset"]
19
+
20
+
21
+ class _AbstractDataset:
22
+ data: List[Any] = []
23
+ _pre_transforms: Optional[Callable[[Any, Any], Tuple[Any, Any]]] = None
24
+
25
+ def __init__(
26
+ self,
27
+ root: Union[str, Path],
28
+ img_transforms: Optional[Callable[[Any], Any]] = None,
29
+ sample_transforms: Optional[Callable[[Any, Any], Tuple[Any, Any]]] = None,
30
+ pre_transforms: Optional[Callable[[Any, Any], Tuple[Any, Any]]] = None,
31
+ ) -> None:
32
+ if not Path(root).is_dir():
33
+ raise ValueError(f"expected a path to a reachable folder: {root}")
34
+
35
+ self.root = root
36
+ self.img_transforms = img_transforms
37
+ self.sample_transforms = sample_transforms
38
+ self._pre_transforms = pre_transforms
39
+ self._get_img_shape = get_img_shape
40
+
41
+ def __len__(self) -> int:
42
+ return len(self.data)
43
+
44
+ def _read_sample(self, index: int) -> Tuple[Any, Any]:
45
+ raise NotImplementedError
46
+
47
+ def __getitem__(self, index: int) -> Tuple[Any, Any]:
48
+ # Read image
49
+ img, target = self._read_sample(index)
50
+ # Pre-transforms (format conversion at run-time etc.)
51
+ if self._pre_transforms is not None:
52
+ img, target = self._pre_transforms(img, target)
53
+
54
+ if self.img_transforms is not None:
55
+ # typing issue cf. https://github.com/python/mypy/issues/5485
56
+ img = self.img_transforms(img)
57
+
58
+ if self.sample_transforms is not None:
59
+ # Conditions to assess it is detection model with multiple classes and avoid confusion with other tasks.
60
+ if (
61
+ isinstance(target, dict)
62
+ and all(isinstance(item, np.ndarray) for item in target.values())
63
+ and set(target.keys()) != {"boxes", "labels"} # avoid confusion with obj detection target
64
+ ):
65
+ img_transformed = _copy_tensor(img)
66
+ for class_name, bboxes in target.items():
67
+ img_transformed, target[class_name] = self.sample_transforms(img, bboxes)
68
+ img = img_transformed
69
+ else:
70
+ img, target = self.sample_transforms(img, target)
71
+
72
+ return img, target
73
+
74
+ def extra_repr(self) -> str:
75
+ return ""
76
+
77
+ def __repr__(self) -> str:
78
+ return f"{self.__class__.__name__}({self.extra_repr()})"
79
+
80
+
81
+ class _VisionDataset(_AbstractDataset):
82
+ """Implements an abstract dataset
83
+
84
+ Args:
85
+ ----
86
+ url: URL of the dataset
87
+ file_name: name of the file once downloaded
88
+ file_hash: expected SHA256 of the file
89
+ extract_archive: whether the downloaded file is an archive to be extracted
90
+ download: whether the dataset should be downloaded if not present on disk
91
+ overwrite: whether the archive should be re-extracted
92
+ cache_dir: cache directory
93
+ cache_subdir: subfolder to use in the cache
94
+ """
95
+
96
+ def __init__(
97
+ self,
98
+ url: str,
99
+ file_name: Optional[str] = None,
100
+ file_hash: Optional[str] = None,
101
+ extract_archive: bool = False,
102
+ download: bool = False,
103
+ overwrite: bool = False,
104
+ cache_dir: Optional[str] = None,
105
+ cache_subdir: Optional[str] = None,
106
+ **kwargs: Any,
107
+ ) -> None:
108
+ cache_dir = (
109
+ str(os.environ.get("DOCTR_CACHE_DIR", os.path.join(os.path.expanduser("~"), ".cache", "doctr")))
110
+ if cache_dir is None
111
+ else cache_dir
112
+ )
113
+
114
+ cache_subdir = "datasets" if cache_subdir is None else cache_subdir
115
+
116
+ file_name = file_name if isinstance(file_name, str) else os.path.basename(url)
117
+ # Download the file if not present
118
+ archive_path: Union[str, Path] = os.path.join(cache_dir, cache_subdir, file_name)
119
+
120
+ if not os.path.exists(archive_path) and not download:
121
+ raise ValueError("the dataset needs to be downloaded first with download=True")
122
+
123
+ archive_path = download_from_url(url, file_name, file_hash, cache_dir=cache_dir, cache_subdir=cache_subdir)
124
+
125
+ # Extract the archive
126
+ if extract_archive:
127
+ archive_path = Path(archive_path)
128
+ dataset_path = archive_path.parent.joinpath(archive_path.stem)
129
+ if not dataset_path.is_dir() or overwrite:
130
+ shutil.unpack_archive(archive_path, dataset_path)
131
+
132
+ super().__init__(dataset_path if extract_archive else archive_path, **kwargs)
doctr/datasets/datasets/pytorch.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import os
7
+ from copy import deepcopy
8
+ from typing import Any, List, Tuple
9
+
10
+ import numpy as np
11
+ import torch
12
+
13
+ from doctr.io import read_img_as_tensor, tensor_from_numpy
14
+
15
+ from .base import _AbstractDataset, _VisionDataset
16
+
17
+ __all__ = ["AbstractDataset", "VisionDataset"]
18
+
19
+
20
class AbstractDataset(_AbstractDataset):
    """Abstract class for all datasets"""

    def _read_sample(self, index: int) -> Tuple[torch.Tensor, Any]:
        """Load one sample: validate its target structure, then read its image as a float tensor."""
        img_name, target = self.data[index]

        # Validate the target layout before paying the cost of loading the image
        if isinstance(target, dict):
            assert "boxes" in target, "Target should contain 'boxes' key"
            assert "labels" in target, "Target should contain 'labels' key"
        elif isinstance(target, tuple):
            assert len(target) == 2
            assert isinstance(
                target[0], (str, np.ndarray)
            ), "first element of the tuple should be a string or a numpy array"
            assert isinstance(target[1], list), "second element of the tuple should be a list"
        else:
            assert isinstance(target, (str, np.ndarray)), "Target should be a string or a numpy array"

        # Either convert an in-memory array, or load the image file from disk
        if isinstance(img_name, np.ndarray):
            img = tensor_from_numpy(img_name, dtype=torch.float32)
        else:
            img = read_img_as_tensor(os.path.join(self.root, img_name), dtype=torch.float32)

        # Deep-copy the target so callers cannot mutate the cached annotations
        return img, deepcopy(target)

    @staticmethod
    def collate_fn(samples: List[Tuple[torch.Tensor, Any]]) -> Tuple[torch.Tensor, List[Any]]:
        """Batch samples: stack images along a new leading dim, keep targets as a list."""
        images, targets = zip(*samples)
        return torch.stack(images, dim=0), list(targets)  # type: ignore[return-value]
56
+
57
+
58
class VisionDataset(AbstractDataset, _VisionDataset):  # noqa: D101
    # PyTorch-backend vision dataset: combines the torch sample loading of
    # AbstractDataset with the download/extract machinery of _VisionDataset.
    pass
doctr/datasets/datasets/tensorflow.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import os
7
+ from copy import deepcopy
8
+ from typing import Any, List, Tuple
9
+
10
+ import numpy as np
11
+ import tensorflow as tf
12
+
13
+ from doctr.io import read_img_as_tensor, tensor_from_numpy
14
+
15
+ from .base import _AbstractDataset, _VisionDataset
16
+
17
+ __all__ = ["AbstractDataset", "VisionDataset"]
18
+
19
+
20
class AbstractDataset(_AbstractDataset):
    """Abstract class for all datasets"""

    def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]:
        """Load one sample: validate its target structure, then read its image as a float tensor."""
        img_name, target = self.data[index]

        # Validate the target layout before paying the cost of loading the image
        if isinstance(target, dict):
            assert "boxes" in target, "Target should contain 'boxes' key"
            assert "labels" in target, "Target should contain 'labels' key"
        elif isinstance(target, tuple):
            assert len(target) == 2
            assert isinstance(
                target[0], (str, np.ndarray)
            ), "first element of the tuple should be a string or a numpy array"
            assert isinstance(target[1], list), "second element of the tuple should be a list"
        else:
            assert isinstance(target, (str, np.ndarray)), "Target should be a string or a numpy array"

        # Either convert an in-memory array, or load the image file from disk
        if isinstance(img_name, np.ndarray):
            img = tensor_from_numpy(img_name, dtype=tf.float32)
        else:
            img = read_img_as_tensor(os.path.join(self.root, img_name), dtype=tf.float32)

        # Deep-copy the target so callers cannot mutate the cached annotations
        return img, deepcopy(target)

    @staticmethod
    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
        """Batch samples: stack images along a new leading axis, keep targets as a list."""
        images, targets = zip(*samples)
        return tf.stack(images, axis=0), list(targets)
56
+
57
+
58
class VisionDataset(AbstractDataset, _VisionDataset):  # noqa: D101
    # TensorFlow-backend vision dataset: combines the tf sample loading of
    # AbstractDataset with the download/extract machinery of _VisionDataset.
    pass
doctr/datasets/detection.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import json
7
+ import os
8
+ from typing import Any, Dict, List, Tuple, Type, Union
9
+
10
+ import numpy as np
11
+
12
+ from doctr.file_utils import CLASS_NAME
13
+
14
+ from .datasets import AbstractDataset
15
+ from .utils import pre_transform_multiclass
16
+
17
+ __all__ = ["DetectionDataset"]
18
+
19
+
20
class DetectionDataset(AbstractDataset):
    """Implements a text detection dataset

    >>> from doctr.datasets import DetectionDataset
    >>> train_set = DetectionDataset(img_folder="/path/to/images",
    >>>                              label_path="/path/to/labels.json")
    >>> img, target = train_set[0]

    Args:
    ----
        img_folder: folder with all the images of the dataset
        label_path: path to the annotations of each image
        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
        **kwargs: keyword arguments from `AbstractDataset`.
    """

    def __init__(
        self,
        img_folder: str,
        label_path: str,
        use_polygons: bool = False,
        **kwargs: Any,
    ) -> None:
        super().__init__(
            img_folder,
            pre_transforms=pre_transform_multiclass,
            **kwargs,
        )

        self._class_names: List = []
        # Fail early when the annotation file is missing
        if not os.path.exists(label_path):
            raise FileNotFoundError(f"unable to locate {label_path}")
        with open(label_path, "rb") as f:
            labels = json.load(f)

        np_dtype = np.float32
        self.data: List[Tuple[str, Tuple[np.ndarray, List[str]]]] = []
        for img_name, label in labels.items():
            # Every referenced image must exist on disk
            img_path = os.path.join(self.root, img_name)
            if not os.path.exists(img_path):
                raise FileNotFoundError(f"unable to locate {img_path}")

            geoms, polygons_classes = self.format_polygons(label["polygons"], use_polygons, np_dtype)
            self.data.append((img_name, (np.asarray(geoms, dtype=np_dtype), polygons_classes)))

    def format_polygons(
        self, polygons: Union[List, Dict], use_polygons: bool, np_dtype: Type
    ) -> Tuple[np.ndarray, List[str]]:
        """Format polygons into an array

        Args:
        ----
            polygons: the bounding boxes
            use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
            np_dtype: dtype of array

        Returns:
        -------
            geoms: bounding boxes as np array
            polygons_classes: list of classes for each bounding box
        """
        if isinstance(polygons, list):
            # Single-class annotations: everything belongs to the default class
            self._class_names.append(CLASS_NAME)
            polygons_classes = [CLASS_NAME] * len(polygons)
            _polygons = np.asarray(polygons, dtype=np_dtype)
        elif isinstance(polygons, dict):
            # Multi-class annotations: one polygon list per class name
            self._class_names.extend(polygons.keys())
            polygons_classes = [cls for cls, class_polys in polygons.items() for _ in class_polys]
            _polygons = np.concatenate(
                [np.asarray(class_polys, dtype=np_dtype) for class_polys in polygons.values() if class_polys],
                axis=0,
            )
        else:
            raise TypeError(f"polygons should be a dictionary or list, it was {type(polygons)}")

        if use_polygons:
            geoms = _polygons
        else:
            # Collapse each polygon to its axis-aligned box: (xmin, ymin, xmax, ymax)
            geoms = np.concatenate((_polygons.min(axis=1), _polygons.max(axis=1)), axis=1)
        return geoms, polygons_classes

    @property
    def class_names(self):
        # Deduplicated, deterministic ordering of all classes seen in the labels
        return sorted(set(self._class_names))
doctr/datasets/doc_artefacts.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import json
7
+ import os
8
+ from typing import Any, Dict, List, Tuple
9
+
10
+ import numpy as np
11
+
12
+ from .datasets import VisionDataset
13
+
14
+ __all__ = ["DocArtefacts"]
15
+
16
+
17
class DocArtefacts(VisionDataset):
    """Object detection dataset for non-textual elements in documents.
    The dataset includes a variety of synthetic document pages with non-textual elements.

    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/artefacts-grid.png&src=0
        :align: center

    >>> from doctr.datasets import DocArtefacts
    >>> train_set = DocArtefacts(train=True, download=True)
    >>> img, target = train_set[0]

    Args:
    ----
        train: whether the subset should be the training one
        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
        **kwargs: keyword arguments from `VisionDataset`.
    """

    URL = "https://doctr-static.mindee.com/models?id=v0.4.0/artefact_detection-13fab8ce.zip&src=0"
    SHA256 = "13fab8ced7f84583d9dccd0c634f046c3417e62a11fe1dea6efbbaba5052471b"
    # Position in this list is the integer class id stored in targets
    CLASSES = ["background", "qr_code", "bar_code", "logo", "photo"]

    def __init__(
        self,
        train: bool = True,
        use_polygons: bool = False,
        **kwargs: Any,
    ) -> None:
        # Download + extract the archive (no explicit file name: defaults to the URL basename)
        super().__init__(self.URL, None, self.SHA256, True, **kwargs)
        self.train = train

        # Update root
        self.root = os.path.join(self.root, "train" if train else "val")
        # List images
        tmp_root = os.path.join(self.root, "images")
        with open(os.path.join(self.root, "labels.json"), "rb") as f:
            labels = json.load(f)
        self.data: List[Tuple[str, Dict[str, Any]]] = []
        img_list = os.listdir(tmp_root)
        if len(labels) != len(img_list):
            raise AssertionError("the number of images and labels do not match")
        np_dtype = np.float32
        for img_name, label in labels.items():
            # File existence check
            if not os.path.exists(os.path.join(tmp_root, img_name)):
                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_name)}")

            # xmin, ymin, xmax, ymax
            boxes: np.ndarray = np.asarray([obj["geometry"] for obj in label], dtype=np_dtype)
            classes: np.ndarray = np.asarray([self.CLASSES.index(obj["label"]) for obj in label], dtype=np.int64)
            if use_polygons:
                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                boxes = np.stack(
                    [
                        np.stack([boxes[:, 0], boxes[:, 1]], axis=-1),
                        np.stack([boxes[:, 2], boxes[:, 1]], axis=-1),
                        np.stack([boxes[:, 2], boxes[:, 3]], axis=-1),
                        np.stack([boxes[:, 0], boxes[:, 3]], axis=-1),
                    ],
                    axis=1,
                )
            self.data.append((img_name, dict(boxes=boxes, labels=classes)))
        # Point root at the images folder, since self.data stores bare file names
        self.root = tmp_root

    def extra_repr(self) -> str:
        # Shown in the dataset's repr to distinguish train/val instances
        return f"train={self.train}"
doctr/datasets/funsd.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import json
7
+ import os
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List, Tuple, Union
10
+
11
+ import numpy as np
12
+ from tqdm import tqdm
13
+
14
+ from .datasets import VisionDataset
15
+ from .utils import convert_target_to_relative, crop_bboxes_from_image
16
+
17
+ __all__ = ["FUNSD"]
18
+
19
+
20
class FUNSD(VisionDataset):
    """FUNSD dataset from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents"
    <https://arxiv.org/pdf/1905.13538.pdf>`_.

    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/funsd-grid.png&src=0
        :align: center

    >>> from doctr.datasets import FUNSD
    >>> train_set = FUNSD(train=True, download=True)
    >>> img, target = train_set[0]

    Args:
    ----
        train: whether the subset should be the training one
        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
        recognition_task: whether the dataset should be used for recognition task
        **kwargs: keyword arguments from `VisionDataset`.
    """

    URL = "https://guillaumejaume.github.io/FUNSD/dataset.zip"
    SHA256 = "c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f"
    FILE_NAME = "funsd.zip"

    def __init__(
        self,
        train: bool = True,
        use_polygons: bool = False,
        recognition_task: bool = False,
        **kwargs: Any,
    ) -> None:
        # Detection targets get converted to relative coordinates; recognition works on
        # cropped images, so no coordinate transform is applied in that mode
        super().__init__(
            self.URL,
            self.FILE_NAME,
            self.SHA256,
            True,
            pre_transforms=convert_target_to_relative if not recognition_task else None,
            **kwargs,
        )
        self.train = train
        np_dtype = np.float32

        # Use the subset
        subfolder = os.path.join("dataset", "training_data" if train else "testing_data")

        # List images
        tmp_root = os.path.join(self.root, subfolder, "images")
        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
        for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking FUNSD", total=len(os.listdir(tmp_root))):
            # File existence check
            if not os.path.exists(os.path.join(tmp_root, img_path)):
                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")

            # One JSON annotation file per image, named after the image stem
            stem = Path(img_path).stem
            with open(os.path.join(self.root, subfolder, "annotations", f"{stem}.json"), "rb") as f:
                data = json.load(f)

            # Keep only non-empty words; each entry is (text, box) with box = [xmin, ymin, xmax, ymax]
            _targets = [
                (word["text"], word["box"])
                for block in data["form"]
                for word in block["words"]
                if len(word["text"]) > 0
            ]
            text_targets, box_targets = zip(*_targets)
            if use_polygons:
                # xmin, ymin, xmax, ymax -> (x, y) coordinates of top left, top right, bottom right, bottom left corners
                box_targets = [  # type: ignore[assignment]
                    [
                        [box[0], box[1]],
                        [box[2], box[1]],
                        [box[2], box[3]],
                        [box[0], box[3]],
                    ]
                    for box in box_targets
                ]

            if recognition_task:
                crops = crop_bboxes_from_image(
                    img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=np_dtype)
                )
                for crop, label in zip(crops, list(text_targets)):
                    # filter labels with unknown characters (checkbox symbols and private-use glyphs)
                    if not any(char in label for char in ["☑", "☐", "\uf703", "\uf702"]):
                        self.data.append((crop, label))
            else:
                self.data.append((
                    img_path,
                    dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=list(text_targets)),
                ))

        # Point root at the images folder, since self.data stores bare file names
        self.root = tmp_root

    def extra_repr(self) -> str:
        # Shown in the dataset's repr to distinguish train/test instances
        return f"train={self.train}"
doctr/datasets/generator/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from doctr.file_utils import is_tf_available, is_torch_available
2
+
3
+ if is_tf_available():
4
+ from .tensorflow import *
5
+ elif is_torch_available():
6
+ from .pytorch import * # type: ignore[assignment]
doctr/datasets/generator/base.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import random
7
+ from typing import Any, Callable, List, Optional, Tuple, Union
8
+
9
+ from PIL import Image, ImageDraw
10
+
11
+ from doctr.io.image import tensor_from_pil
12
+ from doctr.utils.fonts import get_font
13
+
14
+ from ..datasets import AbstractDataset
15
+
16
+
17
def synthesize_text_img(
    text: str,
    font_size: int = 32,
    font_family: Optional[str] = None,
    background_color: Optional[Tuple[int, int, int]] = None,
    text_color: Optional[Tuple[int, int, int]] = None,
) -> Image.Image:
    """Generate a synthetic text image

    Args:
    ----
        text: the text to render as an image
        font_size: the size of the font
        font_family: the font family (has to be installed on your system)
        background_color: background color of the final image
        text_color: text color on the final image

    Returns:
    -------
        PIL image of the text
    """
    background_color = (0, 0, 0) if background_color is None else background_color
    text_color = (255, 255, 255) if text_color is None else text_color

    font = get_font(font_family, font_size)
    left, top, right, bottom = font.getbbox(text)
    text_w, text_h = right - left, bottom - top
    # Pad the ink box: ~30% vertically, ~10% horizontally
    h, w = int(round(1.3 * text_h)), int(round(1.1 * text_w))
    # If single letter, make the image square, otherwise expand to meet the text size
    img_size = (h, w) if len(text) > 1 else (max(h, w), max(h, w))

    img = Image.new("RGB", img_size[::-1], color=background_color)
    d = ImageDraw.Draw(img)

    # Offset so that the text is centered. ImageDraw.text anchors at the glyph
    # origin, while getbbox reports the ink box relative to that origin, so the
    # bbox offsets (left, top) must be subtracted — otherwise fonts with a
    # non-zero bearing render below/right of center.
    text_pos = (
        int(round((img_size[1] - text_w) / 2 - left)),
        int(round((img_size[0] - text_h) / 2 - top)),
    )
    # Draw the text
    d.text(text_pos, text, font=font, fill=text_color)
    return img
56
+
57
+
58
class _CharacterGenerator(AbstractDataset):
    """Backend-agnostic core of the character image generator.

    Renders single characters from ``vocab`` into images, either lazily on each
    access or pre-rendered once when ``cache_samples`` is enabled.
    """

    def __init__(
        self,
        vocab: str,
        num_samples: int,
        cache_samples: bool = False,
        font_family: Optional[Union[str, List[str]]] = None,
        img_transforms: Optional[Callable[[Any], Any]] = None,
        sample_transforms: Optional[Callable[[Any, Any], Tuple[Any, Any]]] = None,
    ) -> None:
        self.vocab = vocab
        self._num_samples = num_samples
        if isinstance(font_family, list):
            self.font_family = font_family
            # Make sure every requested font can actually be loaded
            for font in font_family:
                try:
                    _ = get_font(font, 10)
                except OSError:
                    raise ValueError(f"unable to locate font: {font}")
        else:
            self.font_family = [font_family]  # type: ignore[list-item]
        self.img_transforms = img_transforms
        self.sample_transforms = sample_transforms

        self._data: List[Image.Image] = []
        if cache_samples:
            # Pre-render every (character, font) combination once; the target is
            # the character's index in the vocab
            cached = []
            for idx, char in enumerate(self.vocab):
                for font in self.font_family:
                    cached.append((synthesize_text_img(char, font_family=font), idx))
            self._data = cached  # type: ignore[assignment]

    def __len__(self) -> int:
        return self._num_samples

    def _read_sample(self, index: int) -> Tuple[Any, int]:
        if self._data:
            # Cached mode: wrap around the pre-rendered samples
            pil_img, target = self._data[index % len(self._data)]  # type: ignore[misc]
        else:
            # Lazy mode: render the character on demand with a random font
            target = index % len(self.vocab)
            pil_img = synthesize_text_img(self.vocab[target], font_family=random.choice(self.font_family))
        return tensor_from_pil(pil_img), target
103
+
104
+
105
class _WordGenerator(AbstractDataset):
    """Backend-agnostic core of the word image generator.

    Renders random strings drawn from ``vocab`` (length within
    ``[min_chars, max_chars]``) into images, lazily or pre-rendered when
    ``cache_samples`` is enabled.
    """

    def __init__(
        self,
        vocab: str,
        min_chars: int,
        max_chars: int,
        num_samples: int,
        cache_samples: bool = False,
        font_family: Optional[Union[str, List[str]]] = None,
        img_transforms: Optional[Callable[[Any], Any]] = None,
        sample_transforms: Optional[Callable[[Any, Any], Tuple[Any, Any]]] = None,
    ) -> None:
        self.vocab = vocab
        self.wordlen_range = (min_chars, max_chars)
        self._num_samples = num_samples
        if isinstance(font_family, list):
            self.font_family = font_family
            # Make sure every requested font can actually be loaded
            for font in font_family:
                try:
                    _ = get_font(font, 10)
                except OSError:
                    raise ValueError(f"unable to locate font: {font}")
        else:
            self.font_family = [font_family]  # type: ignore[list-item]
        self.img_transforms = img_transforms
        self.sample_transforms = sample_transforms

        self._data: List[Image.Image] = []
        if cache_samples:
            # Two passes on purpose: draw all the random strings first, then pick
            # fonts while rendering (keeps the RNG consumption order stable)
            _words = [self._generate_string(*self.wordlen_range) for _ in range(num_samples)]
            self._data = [
                (synthesize_text_img(text, font_family=random.choice(self.font_family)), text)  # type: ignore[misc]
                for text in _words
            ]

    def _generate_string(self, min_chars: int, max_chars: int) -> str:
        # Uniformly pick a length, then sample each character independently
        length = random.randint(min_chars, max_chars)
        return "".join(random.choice(self.vocab) for _ in range(length))

    def __len__(self) -> int:
        return self._num_samples

    def _read_sample(self, index: int) -> Tuple[Any, str]:
        if self._data:
            # Cached mode: the sample was rendered in __init__
            pil_img, target = self._data[index]  # type: ignore[misc]
        else:
            # Lazy mode: generate a fresh word and render it with a random font
            target = self._generate_string(*self.wordlen_range)
            pil_img = synthesize_text_img(target, font_family=random.choice(self.font_family))
        return tensor_from_pil(pil_img), target
doctr/datasets/generator/pytorch.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ from torch.utils.data._utils.collate import default_collate
7
+
8
+ from .base import _CharacterGenerator, _WordGenerator
9
+
10
+ __all__ = ["CharacterGenerator", "WordGenerator"]
11
+
12
+
13
class CharacterGenerator(_CharacterGenerator):
    """Character image generation dataset (PyTorch backend).

    >>> from doctr.datasets import CharacterGenerator
    >>> ds = CharacterGenerator(vocab='abdef', num_samples=100)
    >>> img, target = ds[0]

    Args:
    ----
        vocab: vocabulary to take the character from
        num_samples: number of samples that will be generated iterating over the dataset
        cache_samples: whether generated images should be cached firsthand
        font_family: font to use to generate the text images
        img_transforms: composable transformations that will be applied to each image
        sample_transforms: composable transformations that will be applied to both the image and the target
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # (image, label-index) pairs batch cleanly with torch's default collation
        self.collate_fn = default_collate
33
+
34
+
35
class WordGenerator(_WordGenerator):
    """Implements a word image generation dataset

    >>> from doctr.datasets import WordGenerator
    >>> ds = WordGenerator(vocab='abdef', min_chars=1, max_chars=32, num_samples=100)
    >>> img, target = ds[0]

    Args:
    ----
        vocab: vocabulary to take the character from
        min_chars: minimum number of characters in a word
        max_chars: maximum number of characters in a word
        num_samples: number of samples that will be generated iterating over the dataset
        cache_samples: whether generated images should be cached firsthand
        font_family: font to use to generate the text images
        img_transforms: composable transformations that will be applied to each image
        sample_transforms: composable transformations that will be applied to both the image and the target
    """

    # All behavior comes from _WordGenerator; torch's default per-sample handling suffices
    pass
doctr/datasets/generator/tensorflow.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import tensorflow as tf
7
+
8
+ from .base import _CharacterGenerator, _WordGenerator
9
+
10
+ __all__ = ["CharacterGenerator", "WordGenerator"]
11
+
12
+
13
class CharacterGenerator(_CharacterGenerator):
    """Character image generation dataset (TensorFlow backend).

    >>> from doctr.datasets import CharacterGenerator
    >>> ds = CharacterGenerator(vocab='abdef', num_samples=100)
    >>> img, target = ds[0]

    Args:
    ----
        vocab: vocabulary to take the character from
        num_samples: number of samples that will be generated iterating over the dataset
        cache_samples: whether generated images should be cached firsthand
        font_family: font to use to generate the text images
        img_transforms: composable transformations that will be applied to each image
        sample_transforms: composable transformations that will be applied to both the image and the target
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

    @staticmethod
    def collate_fn(samples):
        """Stack sample images into one batch tensor and gather the label indices."""
        images, targets = zip(*samples)
        # Batch images along a new leading axis; integer targets become a tensor too
        return tf.stack(images, axis=0), tf.convert_to_tensor(targets)
39
+
40
+
41
class WordGenerator(_WordGenerator):
    """Implements a word image generation dataset

    >>> from doctr.datasets import WordGenerator
    >>> ds = WordGenerator(vocab='abdef', min_chars=1, max_chars=32, num_samples=100)
    >>> img, target = ds[0]

    Args:
    ----
        vocab: vocabulary to take the character from
        min_chars: minimum number of characters in a word
        max_chars: maximum number of characters in a word
        num_samples: number of samples that will be generated iterating over the dataset
        cache_samples: whether generated images should be cached firsthand
        font_family: font to use to generate the text images
        img_transforms: composable transformations that will be applied to each image
        sample_transforms: composable transformations that will be applied to both the image and the target
    """

    # All behavior comes from _WordGenerator; no TF-specific collation is needed here
    pass
doctr/datasets/ic03.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import os
7
+ from typing import Any, Dict, List, Tuple, Union
8
+
9
+ import defusedxml.ElementTree as ET
10
+ import numpy as np
11
+ from tqdm import tqdm
12
+
13
+ from .datasets import VisionDataset
14
+ from .utils import convert_target_to_relative, crop_bboxes_from_image
15
+
16
+ __all__ = ["IC03"]
17
+
18
+
19
class IC03(VisionDataset):
    """IC03 dataset from `"ICDAR 2003 Robust Reading Competitions: Entries, Results and Future Directions"
    <http://www.iapr-tc11.org/mediawiki/index.php?title=ICDAR_2003_Robust_Reading_Competitions>`_.

    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/ic03-grid.png&src=0
        :align: center

    >>> from doctr.datasets import IC03
    >>> train_set = IC03(train=True, download=True)
    >>> img, target = train_set[0]

    Args:
    ----
        train: whether the subset should be the training one
        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
        recognition_task: whether the dataset should be used for recognition task
        **kwargs: keyword arguments from `VisionDataset`.
    """

    # (url, sha256, archive file name) per split
    TRAIN = (
        "http://www.iapr-tc11.org/dataset/ICDAR2003_RobustReading/TrialTrain/scene.zip",
        "9d86df514eb09dd693fb0b8c671ef54a0cfe02e803b1bbef9fc676061502eb94",
        "ic03_train.zip",
    )
    TEST = (
        "http://www.iapr-tc11.org/dataset/ICDAR2003_RobustReading/TrialTest/scene.zip",
        "dbc4b5fd5d04616b8464a1b42ea22db351ee22c2546dd15ac35611857ea111f8",
        "ic03_test.zip",
    )

    def __init__(
        self,
        train: bool = True,
        use_polygons: bool = False,
        recognition_task: bool = False,
        **kwargs: Any,
    ) -> None:
        url, sha256, file_name = self.TRAIN if train else self.TEST
        # Detection targets get converted to relative coordinates; recognition works on crops
        super().__init__(
            url,
            file_name,
            sha256,
            True,
            pre_transforms=convert_target_to_relative if not recognition_task else None,
            **kwargs,
        )
        self.train = train
        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
        np_dtype = np.float32

        # Load xml data
        tmp_root = (
            os.path.join(self.root, "SceneTrialTrain" if self.train else "SceneTrialTest") if sha256 else self.root
        )
        xml_tree = ET.parse(os.path.join(tmp_root, "words.xml"))
        xml_root = xml_tree.getroot()

        # Each <image> element holds the file name, the resolution, and the word rectangles
        for image in tqdm(iterable=xml_root, desc="Unpacking IC03", total=len(xml_root)):
            name, _resolution, rectangles = image

            # File existence check
            if not os.path.exists(os.path.join(tmp_root, name.text)):
                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, name.text)}")

            if use_polygons:
                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                _boxes = [
                    [
                        [float(rect.attrib["x"]), float(rect.attrib["y"])],
                        [float(rect.attrib["x"]) + float(rect.attrib["width"]), float(rect.attrib["y"])],
                        [
                            float(rect.attrib["x"]) + float(rect.attrib["width"]),
                            float(rect.attrib["y"]) + float(rect.attrib["height"]),
                        ],
                        [float(rect.attrib["x"]), float(rect.attrib["y"]) + float(rect.attrib["height"])],
                    ]
                    for rect in rectangles
                ]
            else:
                # x_min, y_min, x_max, y_max
                _boxes = [
                    [
                        float(rect.attrib["x"]),  # type: ignore[list-item]
                        float(rect.attrib["y"]),  # type: ignore[list-item]
                        float(rect.attrib["x"]) + float(rect.attrib["width"]),  # type: ignore[list-item]
                        float(rect.attrib["y"]) + float(rect.attrib["height"]),  # type: ignore[list-item]
                    ]
                    for rect in rectangles
                ]

            # filter images without boxes
            if len(_boxes) > 0:
                boxes: np.ndarray = np.asarray(_boxes, dtype=np_dtype)
                # Get the labels (the text child of each rectangle element)
                labels = [lab.text for rect in rectangles for lab in rect if lab.text]

                if recognition_task:
                    # Recognition mode: store one cropped image per non-degenerate box
                    crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, name.text), geoms=boxes)
                    for crop, label in zip(crops, labels):
                        if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
                            self.data.append((crop, label))
                else:
                    # Detection mode: store the file name with its boxes and labels
                    self.data.append((name.text, dict(boxes=boxes, labels=labels)))

        self.root = tmp_root

    def extra_repr(self) -> str:
        # Shown in the dataset's repr to distinguish train/test instances
        return f"train={self.train}"
doctr/datasets/ic13.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import csv
7
+ import os
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List, Tuple, Union
10
+
11
+ import numpy as np
12
+ from tqdm import tqdm
13
+
14
+ from .datasets import AbstractDataset
15
+ from .utils import convert_target_to_relative, crop_bboxes_from_image
16
+
17
+ __all__ = ["IC13"]
18
+
19
+
20
+ class IC13(AbstractDataset):
21
+ """IC13 dataset from `"ICDAR 2013 Robust Reading Competition" <https://rrc.cvc.uab.es/>`_.
22
+
23
+ .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/ic13-grid.png&src=0
24
+ :align: center
25
+
26
+ >>> # NOTE: You need to download both image and label parts from Focused Scene Text challenge Task2.1 2013-2015.
27
+ >>> from doctr.datasets import IC13
28
+ >>> train_set = IC13(img_folder="/path/to/Challenge2_Training_Task12_Images",
29
+ >>> label_folder="/path/to/Challenge2_Training_Task1_GT")
30
+ >>> img, target = train_set[0]
31
+ >>> test_set = IC13(img_folder="/path/to/Challenge2_Test_Task12_Images",
32
+ >>> label_folder="/path/to/Challenge2_Test_Task1_GT")
33
+ >>> img, target = test_set[0]
34
+
35
+ Args:
36
+ ----
37
+ img_folder: folder with all the images of the dataset
38
+ label_folder: folder with all annotation files for the images
39
+ use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
40
+ recognition_task: whether the dataset should be used for recognition task
41
+ **kwargs: keyword arguments from `AbstractDataset`.
42
+ """
43
+
44
+ def __init__(
45
+ self,
46
+ img_folder: str,
47
+ label_folder: str,
48
+ use_polygons: bool = False,
49
+ recognition_task: bool = False,
50
+ **kwargs: Any,
51
+ ) -> None:
52
+ super().__init__(
53
+ img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
54
+ )
55
+
56
+ # File existence check
57
+ if not os.path.exists(label_folder) or not os.path.exists(img_folder):
58
+ raise FileNotFoundError(
59
+ f"unable to locate {label_folder if not os.path.exists(label_folder) else img_folder}"
60
+ )
61
+
62
+ self.data: List[Tuple[Union[Path, np.ndarray], Union[str, Dict[str, Any]]]] = []
63
+ np_dtype = np.float32
64
+
65
+ img_names = os.listdir(img_folder)
66
+
67
+ for img_name in tqdm(iterable=img_names, desc="Unpacking IC13", total=len(img_names)):
68
+ img_path = Path(img_folder, img_name)
69
+ label_path = Path(label_folder, "gt_" + Path(img_name).stem + ".txt")
70
+
71
+ with open(label_path, newline="\n") as f:
72
+ _lines = [
73
+ [val[:-1] if val.endswith(",") else val for val in row]
74
+ for row in csv.reader(f, delimiter=" ", quotechar="'")
75
+ ]
76
+ labels = [line[-1].replace('"', "") for line in _lines]
77
+ # xmin, ymin, xmax, ymax
78
+ box_targets: np.ndarray = np.array([list(map(int, line[:4])) for line in _lines], dtype=np_dtype)
79
+ if use_polygons:
80
+ # (x, y) coordinates of top left, top right, bottom right, bottom left corners
81
+ box_targets = np.array(
82
+ [
83
+ [
84
+ [coords[0], coords[1]],
85
+ [coords[2], coords[1]],
86
+ [coords[2], coords[3]],
87
+ [coords[0], coords[3]],
88
+ ]
89
+ for coords in box_targets
90
+ ],
91
+ dtype=np_dtype,
92
+ )
93
+
94
+ if recognition_task:
95
+ crops = crop_bboxes_from_image(img_path=img_path, geoms=box_targets)
96
+ for crop, label in zip(crops, labels):
97
+ self.data.append((crop, label))
98
+ else:
99
+ self.data.append((img_path, dict(boxes=box_targets, labels=labels)))
doctr/datasets/iiit5k.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import os
7
+ from typing import Any, Dict, List, Tuple, Union
8
+
9
+ import numpy as np
10
+ import scipy.io as sio
11
+ from tqdm import tqdm
12
+
13
+ from .datasets import VisionDataset
14
+ from .utils import convert_target_to_relative
15
+
16
+ __all__ = ["IIIT5K"]
17
+
18
+
19
+ class IIIT5K(VisionDataset):
20
+ """IIIT-5K character-level localization dataset from
21
+ `"BMVC 2012 Scene Text Recognition using Higher Order Language Priors"
22
+ <https://cdn.iiit.ac.in/cdn/cvit.iiit.ac.in/images/Projects/SceneTextUnderstanding/home/mishraBMVC12.pdf>`_.
23
+
24
+ .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/iiit5k-grid.png&src=0
25
+ :align: center
26
+
27
+ >>> # NOTE: this dataset is for character-level localization
28
+ >>> from doctr.datasets import IIIT5K
29
+ >>> train_set = IIIT5K(train=True, download=True)
30
+ >>> img, target = train_set[0]
31
+
32
+ Args:
33
+ ----
34
+ train: whether the subset should be the training one
35
+ use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
36
+ recognition_task: whether the dataset should be used for recognition task
37
+ **kwargs: keyword arguments from `VisionDataset`.
38
+ """
39
+
40
+ URL = "https://cvit.iiit.ac.in/images/Projects/SceneTextUnderstanding/IIIT5K-Word_V3.0.tar.gz"
41
+ SHA256 = "7872c9efbec457eb23f3368855e7738f72ce10927f52a382deb4966ca0ffa38e"
42
+
43
+ def __init__(
44
+ self,
45
+ train: bool = True,
46
+ use_polygons: bool = False,
47
+ recognition_task: bool = False,
48
+ **kwargs: Any,
49
+ ) -> None:
50
+ super().__init__(
51
+ self.URL,
52
+ None,
53
+ file_hash=self.SHA256,
54
+ extract_archive=True,
55
+ pre_transforms=convert_target_to_relative if not recognition_task else None,
56
+ **kwargs,
57
+ )
58
+ self.train = train
59
+
60
+ # Load mat data
61
+ tmp_root = os.path.join(self.root, "IIIT5K") if self.SHA256 else self.root
62
+ mat_file = "trainCharBound" if self.train else "testCharBound"
63
+ mat_data = sio.loadmat(os.path.join(tmp_root, f"{mat_file}.mat"))[mat_file][0]
64
+
65
+ self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
66
+ np_dtype = np.float32
67
+
68
+ for img_path, label, box_targets in tqdm(iterable=mat_data, desc="Unpacking IIIT5K", total=len(mat_data)):
69
+ _raw_path = img_path[0]
70
+ _raw_label = label[0]
71
+
72
+ # File existence check
73
+ if not os.path.exists(os.path.join(tmp_root, _raw_path)):
74
+ raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, _raw_path)}")
75
+
76
+ if recognition_task:
77
+ self.data.append((_raw_path, _raw_label))
78
+ else:
79
+ if use_polygons:
80
+ # (x, y) coordinates of top left, top right, bottom right, bottom left corners
81
+ box_targets = [
82
+ [
83
+ [box[0], box[1]],
84
+ [box[0] + box[2], box[1]],
85
+ [box[0] + box[2], box[1] + box[3]],
86
+ [box[0], box[1] + box[3]],
87
+ ]
88
+ for box in box_targets
89
+ ]
90
+ else:
91
+ # xmin, ymin, xmax, ymax
92
+ box_targets = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in box_targets]
93
+
94
+ # labels are cast to a list where each char corresponds to the character's bounding box
95
+ self.data.append((
96
+ _raw_path,
97
+ dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=list(_raw_label)),
98
+ ))
99
+
100
+ self.root = tmp_root
101
+
102
+ def extra_repr(self) -> str:
103
+ return f"train={self.train}"
doctr/datasets/iiithws.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import os
7
+ from random import sample
8
+ from typing import Any, List, Tuple
9
+
10
+ from tqdm import tqdm
11
+
12
+ from .datasets import AbstractDataset
13
+
14
+ __all__ = ["IIITHWS"]
15
+
16
+
17
+ class IIITHWS(AbstractDataset):
18
+ """IIITHWS dataset from `"Generating Synthetic Data for Text Recognition"
19
+ <https://arxiv.org/pdf/1608.04224.pdf>`_ | `"repository" <https://github.com/kris314/hwnet>`_ |
20
+ `"website" <https://cvit.iiit.ac.in/research/projects/cvit-projects/matchdocimgs>`_.
21
+
22
+ >>> # NOTE: This is a pure recognition dataset without bounding box labels.
23
+ >>> # NOTE: You need to download the dataset.
24
+ >>> from doctr.datasets import IIITHWS
25
+ >>> train_set = IIITHWS(img_folder="/path/to/iiit-hws/Images_90K_Normalized",
26
+ >>> label_path="/path/to/IIIT-HWS-90K.txt",
27
+ >>> train=True)
28
+ >>> img, target = train_set[0]
29
+ >>> test_set = IIITHWS(img_folder="/path/to/iiit-hws/Images_90K_Normalized",
30
+ >>> label_path="/path/to/IIIT-HWS-90K.txt",
31
+ >>> train=False)
32
+ >>> img, target = test_set[0]
33
+
34
+ Args:
35
+ ----
36
+ img_folder: folder with all the images of the dataset
37
+ label_path: path to the file with the labels
38
+ train: whether the subset should be the training one
39
+ **kwargs: keyword arguments from `AbstractDataset`.
40
+ """
41
+
42
+ def __init__(
43
+ self,
44
+ img_folder: str,
45
+ label_path: str,
46
+ train: bool = True,
47
+ **kwargs: Any,
48
+ ) -> None:
49
+ super().__init__(img_folder, **kwargs)
50
+
51
+ # File existence check
52
+ if not os.path.exists(label_path) or not os.path.exists(img_folder):
53
+ raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")
54
+
55
+ self.data: List[Tuple[str, str]] = []
56
+ self.train = train
57
+
58
+ with open(label_path) as f:
59
+ annotations = f.readlines()
60
+
61
+ # Shuffle the dataset otherwise the test set will contain the same labels n times
62
+ annotations = sample(annotations, len(annotations))
63
+ train_samples = int(len(annotations) * 0.9)
64
+ set_slice = slice(train_samples) if self.train else slice(train_samples, None)
65
+
66
+ for annotation in tqdm(
67
+ iterable=annotations[set_slice], desc="Unpacking IIITHWS", total=len(annotations[set_slice])
68
+ ):
69
+ img_path, label = annotation.split()[0:2]
70
+ img_path = os.path.join(img_folder, img_path)
71
+
72
+ self.data.append((img_path, label))
73
+
74
+ def extra_repr(self) -> str:
75
+ return f"train={self.train}"
doctr/datasets/imgur5k.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import glob
7
+ import json
8
+ import os
9
+ from pathlib import Path
10
+ from typing import Any, Dict, List, Tuple, Union
11
+
12
+ import cv2
13
+ import numpy as np
14
+ from PIL import Image
15
+ from tqdm import tqdm
16
+
17
+ from .datasets import AbstractDataset
18
+ from .utils import convert_target_to_relative, crop_bboxes_from_image
19
+
20
+ __all__ = ["IMGUR5K"]
21
+
22
+
23
+ class IMGUR5K(AbstractDataset):
24
+ """IMGUR5K dataset from `"TextStyleBrush: Transfer of Text Aesthetics from a Single Example"
25
+ <https://arxiv.org/abs/2106.08385>`_ |
26
+ `repository <https://github.com/facebookresearch/IMGUR5K-Handwriting-Dataset>`_.
27
+
28
+ .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/imgur5k-grid.png&src=0
29
+ :align: center
30
+ :width: 630
31
+ :height: 400
32
+
33
+ >>> # NOTE: You need to download/generate the dataset from the repository.
34
+ >>> from doctr.datasets import IMGUR5K
35
+ >>> train_set = IMGUR5K(train=True, img_folder="/path/to/IMGUR5K-Handwriting-Dataset/images",
36
+ >>> label_path="/path/to/IMGUR5K-Handwriting-Dataset/dataset_info/imgur5k_annotations.json")
37
+ >>> img, target = train_set[0]
38
+ >>> test_set = IMGUR5K(train=False, img_folder="/path/to/IMGUR5K-Handwriting-Dataset/images",
39
+ >>> label_path="/path/to/IMGUR5K-Handwriting-Dataset/dataset_info/imgur5k_annotations.json")
40
+ >>> img, target = test_set[0]
41
+
42
+ Args:
43
+ ----
44
+ img_folder: folder with all the images of the dataset
45
+ label_path: path to the annotations file of the dataset
46
+ train: whether the subset should be the training one
47
+ use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
48
+ recognition_task: whether the dataset should be used for recognition task
49
+ **kwargs: keyword arguments from `AbstractDataset`.
50
+ """
51
+
52
+ def __init__(
53
+ self,
54
+ img_folder: str,
55
+ label_path: str,
56
+ train: bool = True,
57
+ use_polygons: bool = False,
58
+ recognition_task: bool = False,
59
+ **kwargs: Any,
60
+ ) -> None:
61
+ super().__init__(
62
+ img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
63
+ )
64
+
65
+ # File existence check
66
+ if not os.path.exists(label_path) or not os.path.exists(img_folder):
67
+ raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")
68
+
69
+ self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any]]]] = []
70
+ self.train = train
71
+ np_dtype = np.float32
72
+
73
+ img_names = os.listdir(img_folder)
74
+ train_samples = int(len(img_names) * 0.9)
75
+ set_slice = slice(train_samples) if self.train else slice(train_samples, None)
76
+
77
+ # define folder to write IMGUR5K recognition dataset
78
+ reco_folder_name = "IMGUR5K_recognition_train" if self.train else "IMGUR5K_recognition_test"
79
+ reco_folder_name = "Poly_" + reco_folder_name if use_polygons else reco_folder_name
80
+ reco_folder_path = os.path.join(os.path.dirname(self.root), reco_folder_name)
81
+ reco_images_counter = 0
82
+
83
+ if recognition_task and os.path.isdir(reco_folder_path):
84
+ self._read_from_folder(reco_folder_path)
85
+ return
86
+ elif recognition_task and not os.path.isdir(reco_folder_path):
87
+ os.makedirs(reco_folder_path, exist_ok=False)
88
+
89
+ with open(label_path) as f:
90
+ annotation_file = json.load(f)
91
+
92
+ for img_name in tqdm(iterable=img_names[set_slice], desc="Unpacking IMGUR5K", total=len(img_names[set_slice])):
93
+ img_path = Path(img_folder, img_name)
94
+ img_id = img_name.split(".")[0]
95
+
96
+ # File existence check
97
+ if not os.path.exists(os.path.join(self.root, img_name)):
98
+ raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
99
+
100
+ # some files have no annotations which are marked with only a dot in the 'word' key
101
+ # ref: https://github.com/facebookresearch/IMGUR5K-Handwriting-Dataset/blob/main/README.md
102
+ if img_id not in annotation_file["index_to_ann_map"].keys():
103
+ continue
104
+ ann_ids = annotation_file["index_to_ann_map"][img_id]
105
+ annotations = [annotation_file["ann_id"][a_id] for a_id in ann_ids]
106
+
107
+ labels = [ann["word"] for ann in annotations if ann["word"] != "."]
108
+ # x_center, y_center, width, height, angle
109
+ _boxes = [
110
+ list(map(float, ann["bounding_box"].strip("[ ]").split(", ")))
111
+ for ann in annotations
112
+ if ann["word"] != "."
113
+ ]
114
+ # (x, y) coordinates of top left, top right, bottom right, bottom left corners
115
+ box_targets = [cv2.boxPoints(((box[0], box[1]), (box[2], box[3]), box[4])) for box in _boxes] # type: ignore[arg-type]
116
+
117
+ if not use_polygons:
118
+ # xmin, ymin, xmax, ymax
119
+ box_targets = [np.concatenate((points.min(0), points.max(0)), axis=-1) for points in box_targets]
120
+
121
+ # filter images without boxes
122
+ if len(box_targets) > 0:
123
+ if recognition_task:
124
+ crops = crop_bboxes_from_image(
125
+ img_path=os.path.join(self.root, img_name), geoms=np.asarray(box_targets, dtype=np_dtype)
126
+ )
127
+ for crop, label in zip(crops, labels):
128
+ if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
129
+ # write data to disk
130
+ with open(os.path.join(reco_folder_path, f"{reco_images_counter}.txt"), "w") as f:
131
+ f.write(label)
132
+ tmp_img = Image.fromarray(crop)
133
+ tmp_img.save(os.path.join(reco_folder_path, f"{reco_images_counter}.png"))
134
+ reco_images_counter += 1
135
+ else:
136
+ self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=labels)))
137
+
138
+ if recognition_task:
139
+ self._read_from_folder(reco_folder_path)
140
+
141
+ def extra_repr(self) -> str:
142
+ return f"train={self.train}"
143
+
144
+ def _read_from_folder(self, path: str) -> None:
145
+ for img_path in glob.glob(os.path.join(path, "*.png")):
146
+ with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f:
147
+ self.data.append((img_path, f.read()))
doctr/datasets/loader.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import math
7
+ from typing import Callable, Optional
8
+
9
+ import numpy as np
10
+ import tensorflow as tf
11
+
12
+ from doctr.utils.multithreading import multithread_exec
13
+
14
+ __all__ = ["DataLoader"]
15
+
16
+
17
+ def default_collate(samples):
18
+ """Collate multiple elements into batches
19
+
20
+ Args:
21
+ ----
22
+ samples: list of N tuples containing M elements
23
+
24
+ Returns:
25
+ -------
26
+ Tuple of M sequences containing N elements each
27
+ """
28
+ batch_data = zip(*samples)
29
+
30
+ tf_data = tuple(tf.stack(elt, axis=0) for elt in batch_data)
31
+
32
+ return tf_data
33
+
34
+
35
+ class DataLoader:
36
+ """Implements a dataset wrapper for fast data loading
37
+
38
+ >>> from doctr.datasets import CORD, DataLoader
39
+ >>> train_set = CORD(train=True, download=True)
40
+ >>> train_loader = DataLoader(train_set, batch_size=32)
41
+ >>> train_iter = iter(train_loader)
42
+ >>> images, targets = next(train_iter)
43
+
44
+ Args:
45
+ ----
46
+ dataset: the dataset
47
+ shuffle: whether the samples should be shuffled before passing it to the iterator
48
+ batch_size: number of elements in each batch
49
+ drop_last: if `True`, drops the last batch if it isn't full
50
+ num_workers: number of workers to use for data loading
51
+ collate_fn: function to merge samples into a batch
52
+ """
53
+
54
+ def __init__(
55
+ self,
56
+ dataset,
57
+ shuffle: bool = True,
58
+ batch_size: int = 1,
59
+ drop_last: bool = False,
60
+ num_workers: Optional[int] = None,
61
+ collate_fn: Optional[Callable] = None,
62
+ ) -> None:
63
+ self.dataset = dataset
64
+ self.shuffle = shuffle
65
+ self.batch_size = batch_size
66
+ nb = len(self.dataset) / batch_size
67
+ self.num_batches = math.floor(nb) if drop_last else math.ceil(nb)
68
+ if collate_fn is None:
69
+ self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, "collate_fn") else default_collate
70
+ else:
71
+ self.collate_fn = collate_fn
72
+ self.num_workers = num_workers
73
+ self.reset()
74
+
75
+ def __len__(self) -> int:
76
+ return self.num_batches
77
+
78
+ def reset(self) -> None:
79
+ # Updates indices after each epoch
80
+ self._num_yielded = 0
81
+ self.indices = np.arange(len(self.dataset))
82
+ if self.shuffle is True:
83
+ np.random.shuffle(self.indices)
84
+
85
+ def __iter__(self):
86
+ self.reset()
87
+ return self
88
+
89
+ def __next__(self):
90
+ if self._num_yielded < self.num_batches:
91
+ # Get next indices
92
+ idx = self._num_yielded * self.batch_size
93
+ indices = self.indices[idx : min(len(self.dataset), idx + self.batch_size)]
94
+
95
+ samples = list(multithread_exec(self.dataset.__getitem__, indices, threads=self.num_workers))
96
+
97
+ batch_data = self.collate_fn(samples)
98
+
99
+ self._num_yielded += 1
100
+ return batch_data
101
+ else:
102
+ raise StopIteration
doctr/datasets/mjsynth.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import os
7
+ from typing import Any, List, Tuple
8
+
9
+ from tqdm import tqdm
10
+
11
+ from .datasets import AbstractDataset
12
+
13
+ __all__ = ["MJSynth"]
14
+
15
+
16
+ class MJSynth(AbstractDataset):
17
+ """MJSynth dataset from `"Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition"
18
+ <https://www.robots.ox.ac.uk/~vgg/data/text/>`_.
19
+
20
+ >>> # NOTE: This is a pure recognition dataset without bounding box labels.
21
+ >>> # NOTE: You need to download the dataset.
22
+ >>> from doctr.datasets import MJSynth
23
+ >>> train_set = MJSynth(img_folder="/path/to/mjsynth/mnt/ramdisk/max/90kDICT32px",
24
+ >>> label_path="/path/to/mjsynth/mnt/ramdisk/max/90kDICT32px/imlist.txt",
25
+ >>> train=True)
26
+ >>> img, target = train_set[0]
27
+ >>> test_set = MJSynth(img_folder="/path/to/mjsynth/mnt/ramdisk/max/90kDICT32px",
28
+ >>> label_path="/path/to/mjsynth/mnt/ramdisk/max/90kDICT32px/imlist.txt",
29
+ >>> train=False)
30
+ >>> img, target = test_set[0]
31
+
32
+ Args:
33
+ ----
34
+ img_folder: folder with all the images of the dataset
35
+ label_path: path to the file with the labels
36
+ train: whether the subset should be the training one
37
+ **kwargs: keyword arguments from `AbstractDataset`.
38
+ """
39
+
40
+ # filter corrupted or missing images
41
+ BLACKLIST = [
42
+ "./1881/4/225_Marbling_46673.jpg\n",
43
+ "./2069/4/192_whittier_86389.jpg\n",
44
+ "./869/4/234_TRIASSIC_80582.jpg\n",
45
+ "./173/2/358_BURROWING_10395.jpg\n",
46
+ "./913/4/231_randoms_62372.jpg\n",
47
+ "./596/2/372_Ump_81662.jpg\n",
48
+ "./936/2/375_LOCALITIES_44992.jpg\n",
49
+ "./2540/4/246_SQUAMOUS_73902.jpg\n",
50
+ "./1332/4/224_TETHERED_78397.jpg\n",
51
+ "./627/6/83_PATRIARCHATE_55931.jpg\n",
52
+ "./2013/2/370_refract_63890.jpg\n",
53
+ "./2911/6/77_heretical_35885.jpg\n",
54
+ "./1730/2/361_HEREON_35880.jpg\n",
55
+ "./2194/2/334_EFFLORESCENT_24742.jpg\n",
56
+ "./2025/2/364_SNORTERS_72304.jpg\n",
57
+ "./368/4/232_friar_30876.jpg\n",
58
+ "./275/6/96_hackle_34465.jpg\n",
59
+ "./384/4/220_bolts_8596.jpg\n",
60
+ "./905/4/234_Postscripts_59142.jpg\n",
61
+ "./2749/6/101_Chided_13155.jpg\n",
62
+ "./495/6/81_MIDYEAR_48332.jpg\n",
63
+ "./2852/6/60_TOILSOME_79481.jpg\n",
64
+ "./554/2/366_Teleconferences_77948.jpg\n",
65
+ "./1696/4/211_Queened_61779.jpg\n",
66
+ "./2128/2/369_REDACTED_63458.jpg\n",
67
+ "./2557/2/351_DOWN_23492.jpg\n",
68
+ "./2489/4/221_snored_72290.jpg\n",
69
+ "./1650/2/355_stony_74902.jpg\n",
70
+ "./1863/4/223_Diligently_21672.jpg\n",
71
+ "./264/2/362_FORETASTE_30276.jpg\n",
72
+ "./429/4/208_Mainmasts_46140.jpg\n",
73
+ "./1817/2/363_actuating_904.jpg\n",
74
+ ]
75
+
76
+ def __init__(
77
+ self,
78
+ img_folder: str,
79
+ label_path: str,
80
+ train: bool = True,
81
+ **kwargs: Any,
82
+ ) -> None:
83
+ super().__init__(img_folder, **kwargs)
84
+
85
+ # File existence check
86
+ if not os.path.exists(label_path) or not os.path.exists(img_folder):
87
+ raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")
88
+
89
+ self.data: List[Tuple[str, str]] = []
90
+ self.train = train
91
+
92
+ with open(label_path) as f:
93
+ img_paths = f.readlines()
94
+
95
+ train_samples = int(len(img_paths) * 0.9)
96
+ set_slice = slice(train_samples) if self.train else slice(train_samples, None)
97
+
98
+ for path in tqdm(iterable=img_paths[set_slice], desc="Unpacking MJSynth", total=len(img_paths[set_slice])):
99
+ if path not in self.BLACKLIST:
100
+ label = path.split("_")[1]
101
+ img_path = os.path.join(img_folder, path[2:]).strip()
102
+
103
+ self.data.append((img_path, label))
104
+
105
+ def extra_repr(self) -> str:
106
+ return f"train={self.train}"
doctr/datasets/ocr.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import json
7
+ import os
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List, Tuple
10
+
11
+ import numpy as np
12
+
13
+ from .datasets import AbstractDataset
14
+
15
+ __all__ = ["OCRDataset"]
16
+
17
+
18
+ class OCRDataset(AbstractDataset):
19
+ """Implements an OCR dataset
20
+
21
+ >>> from doctr.datasets import OCRDataset
22
+ >>> train_set = OCRDataset(img_folder="/path/to/images",
23
+ >>> label_file="/path/to/labels.json")
24
+ >>> img, target = train_set[0]
25
+
26
+ Args:
27
+ ----
28
+ img_folder: local path to image folder (all jpg at the root)
29
+ label_file: local path to the label file
30
+ use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
31
+ **kwargs: keyword arguments from `AbstractDataset`.
32
+ """
33
+
34
+ def __init__(
35
+ self,
36
+ img_folder: str,
37
+ label_file: str,
38
+ use_polygons: bool = False,
39
+ **kwargs: Any,
40
+ ) -> None:
41
+ super().__init__(img_folder, **kwargs)
42
+
43
+ # List images
44
+ self.data: List[Tuple[str, Dict[str, Any]]] = []
45
+ np_dtype = np.float32
46
+ with open(label_file, "rb") as f:
47
+ data = json.load(f)
48
+
49
+ for img_name, annotations in data.items():
50
+ # Get image path
51
+ img_name = Path(img_name)
52
+ # File existence check
53
+ if not os.path.exists(os.path.join(self.root, img_name)):
54
+ raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
55
+
56
+ # handle empty images
57
+ if len(annotations["typed_words"]) == 0:
58
+ self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np_dtype), labels=[])))
59
+ continue
60
+ # Unpack the straight boxes (xmin, ymin, xmax, ymax)
61
+ geoms = [list(map(float, obj["geometry"][:4])) for obj in annotations["typed_words"]]
62
+ if use_polygons:
63
+ # (x, y) coordinates of top left, top right, bottom right, bottom left corners
64
+ geoms = [
65
+ [geom[:2], [geom[2], geom[1]], geom[2:], [geom[0], geom[3]]] # type: ignore[list-item]
66
+ for geom in geoms
67
+ ]
68
+
69
+ text_targets = [obj["value"] for obj in annotations["typed_words"]]
70
+
71
+ self.data.append((img_name, dict(boxes=np.asarray(geoms, dtype=np_dtype), labels=text_targets)))
doctr/datasets/orientation.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import os
7
+ from typing import Any, List, Tuple
8
+
9
+ import numpy as np
10
+
11
+ from .datasets import AbstractDataset
12
+
13
+ __all__ = ["OrientationDataset"]
14
+
15
+
16
+ class OrientationDataset(AbstractDataset):
17
+ """Implements a basic image dataset where targets are filled with zeros.
18
+
19
+ >>> from doctr.datasets import OrientationDataset
20
+ >>> train_set = OrientationDataset(img_folder="/path/to/images")
21
+ >>> img, target = train_set[0]
22
+
23
+ Args:
24
+ ----
25
+ img_folder: folder with all the images of the dataset
26
+ **kwargs: keyword arguments from `AbstractDataset`.
27
+ """
28
+
29
+ def __init__(
30
+ self,
31
+ img_folder: str,
32
+ **kwargs: Any,
33
+ ) -> None:
34
+ super().__init__(
35
+ img_folder,
36
+ **kwargs,
37
+ )
38
+
39
+ # initialize dataset with 0 degree rotation targets
40
+ self.data: List[Tuple[str, np.ndarray]] = [(img_name, np.array([0])) for img_name in os.listdir(self.root)]
doctr/datasets/recognition.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import json
7
+ import os
8
+ from pathlib import Path
9
+ from typing import Any, List, Tuple
10
+
11
+ from .datasets import AbstractDataset
12
+
13
+ __all__ = ["RecognitionDataset"]
14
+
15
+
16
+ class RecognitionDataset(AbstractDataset):
17
+ """Dataset implementation for text recognition tasks
18
+
19
+ >>> from doctr.datasets import RecognitionDataset
20
+ >>> train_set = RecognitionDataset(img_folder="/path/to/images",
21
+ >>> labels_path="/path/to/labels.json")
22
+ >>> img, target = train_set[0]
23
+
24
+ Args:
25
+ ----
26
+ img_folder: path to the images folder
27
+ labels_path: path to the json file containing all labels (character sequences)
28
+ **kwargs: keyword arguments from `AbstractDataset`.
29
+ """
30
+
31
+ def __init__(
32
+ self,
33
+ img_folder: str,
34
+ labels_path: str,
35
+ **kwargs: Any,
36
+ ) -> None:
37
+ super().__init__(img_folder, **kwargs)
38
+
39
+ self.data: List[Tuple[str, str]] = []
40
+ with open(labels_path, encoding="utf-8") as f:
41
+ labels = json.load(f)
42
+
43
+ for img_name, label in labels.items():
44
+ if not os.path.exists(os.path.join(self.root, img_name)):
45
+ raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
46
+
47
+ self.data.append((img_name, label))
48
+
49
+ def merge_dataset(self, ds: AbstractDataset) -> None:
50
+ # Update data with new root for self
51
+ self.data = [(str(Path(self.root).joinpath(img_path)), label) for img_path, label in self.data]
52
+ # Define new root
53
+ self.root = Path("/")
54
+ # Merge with ds data
55
+ for img_path, label in ds.data:
56
+ self.data.append((str(Path(ds.root).joinpath(img_path)), label))
doctr/datasets/sroie.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import csv
7
+ import os
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List, Tuple, Union
10
+
11
+ import numpy as np
12
+ from tqdm import tqdm
13
+
14
+ from .datasets import VisionDataset
15
+ from .utils import convert_target_to_relative, crop_bboxes_from_image
16
+
17
+ __all__ = ["SROIE"]
18
+
19
+
20
+ class SROIE(VisionDataset):
21
+ """SROIE dataset from `"ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction"
22
+ <https://arxiv.org/pdf/2103.10213.pdf>`_.
23
+
24
+ .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/sroie-grid.png&src=0
25
+ :align: center
26
+
27
+ >>> from doctr.datasets import SROIE
28
+ >>> train_set = SROIE(train=True, download=True)
29
+ >>> img, target = train_set[0]
30
+
31
+ Args:
32
+ ----
33
+ train: whether the subset should be the training one
34
+ use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
35
+ recognition_task: whether the dataset should be used for recognition task
36
+ **kwargs: keyword arguments from `VisionDataset`.
37
+ """
38
+
39
+ TRAIN = (
40
+ "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_train_task1.zip&src=0",
41
+ "d4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f",
42
+ "sroie2019_train_task1.zip",
43
+ )
44
+ TEST = (
45
+ "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_test.zip&src=0",
46
+ "41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2",
47
+ "sroie2019_test.zip",
48
+ )
49
+
50
+ def __init__(
51
+ self,
52
+ train: bool = True,
53
+ use_polygons: bool = False,
54
+ recognition_task: bool = False,
55
+ **kwargs: Any,
56
+ ) -> None:
57
+ url, sha256, name = self.TRAIN if train else self.TEST
58
+ super().__init__(
59
+ url,
60
+ name,
61
+ sha256,
62
+ True,
63
+ pre_transforms=convert_target_to_relative if not recognition_task else None,
64
+ **kwargs,
65
+ )
66
+ self.train = train
67
+
68
+ tmp_root = os.path.join(self.root, "images")
69
+ self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
70
+ np_dtype = np.float32
71
+
72
+ for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking SROIE", total=len(os.listdir(tmp_root))):
73
+ # File existence check
74
+ if not os.path.exists(os.path.join(tmp_root, img_path)):
75
+ raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
76
+
77
+ stem = Path(img_path).stem
78
+ with open(os.path.join(self.root, "annotations", f"{stem}.txt"), encoding="latin") as f:
79
+ _rows = [row for row in list(csv.reader(f, delimiter=",")) if len(row) > 0]
80
+
81
+ labels = [",".join(row[8:]) for row in _rows]
82
+ # reorder coordinates (8 -> (4,2) ->
83
+ # (x, y) coordinates of top left, top right, bottom right, bottom left corners) and filter empty lines
84
+ coords: np.ndarray = np.stack(
85
+ [np.array(list(map(int, row[:8])), dtype=np_dtype).reshape((4, 2)) for row in _rows], axis=0
86
+ )
87
+
88
+ if not use_polygons:
89
+ # xmin, ymin, xmax, ymax
90
+ coords = np.concatenate((coords.min(axis=1), coords.max(axis=1)), axis=1)
91
+
92
+ if recognition_task:
93
+ crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_path), geoms=coords)
94
+ for crop, label in zip(crops, labels):
95
+ if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
96
+ self.data.append((crop, label))
97
+ else:
98
+ self.data.append((img_path, dict(boxes=coords, labels=labels)))
99
+
100
+ self.root = tmp_root
101
+
102
+ def extra_repr(self) -> str:
103
+ return f"train={self.train}"
doctr/datasets/svhn.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import os
7
+ from typing import Any, Dict, List, Tuple, Union
8
+
9
+ import h5py
10
+ import numpy as np
11
+ from tqdm import tqdm
12
+
13
+ from .datasets import VisionDataset
14
+ from .utils import convert_target_to_relative, crop_bboxes_from_image
15
+
16
+ __all__ = ["SVHN"]
17
+
18
+
19
class SVHN(VisionDataset):
    """SVHN dataset from `"The Street View House Numbers (SVHN) Dataset"
    <http://ufldl.stanford.edu/housenumbers/>`_.

    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/svhn-grid.png&src=0
        :align: center

    >>> from doctr.datasets import SVHN
    >>> train_set = SVHN(train=True, download=True)
    >>> img, target = train_set[0]

    Args:
    ----
        train: whether the subset should be the training one
        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
        recognition_task: whether the dataset should be used for recognition task
        **kwargs: keyword arguments from `VisionDataset`.
    """

    # (url, sha256 checksum, archive name) per split
    TRAIN = (
        "http://ufldl.stanford.edu/housenumbers/train.tar.gz",
        "4b17bb33b6cd8f963493168f80143da956f28ec406cc12f8e5745a9f91a51898",
        "svhn_train.tar",
    )

    TEST = (
        "http://ufldl.stanford.edu/housenumbers/test.tar.gz",
        "57ac9ceb530e4aa85b55d991be8fc49c695b3d71c6f6a88afea86549efde7fb5",
        "svhn_test.tar",
    )

    def __init__(
        self,
        train: bool = True,
        use_polygons: bool = False,
        recognition_task: bool = False,
        **kwargs: Any,
    ) -> None:
        url, sha256, name = self.TRAIN if train else self.TEST
        super().__init__(
            url,
            file_name=name,
            file_hash=sha256,
            extract_archive=True,
            # Detection targets are stored relative to the image size; crops for
            # recognition are taken in absolute coordinates, so skip the transform then
            pre_transforms=convert_target_to_relative if not recognition_task else None,
            **kwargs,
        )
        self.train = train
        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
        np_dtype = np.float32

        tmp_root = os.path.join(self.root, "train" if train else "test")

        # Load mat data (matlab v7.3 - can not be loaded with scipy)
        with h5py.File(os.path.join(tmp_root, "digitStruct.mat"), "r") as f:
            img_refs = f["digitStruct/name"]
            box_refs = f["digitStruct/bbox"]
            for img_ref, box_ref in tqdm(iterable=zip(img_refs, box_refs), desc="Unpacking SVHN", total=len(img_refs)):
                # convert ascii matrix to string
                img_name = "".join(map(chr, f[img_ref[0]][()].flatten()))

                # File existence check
                if not os.path.exists(os.path.join(tmp_root, img_name)):
                    raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_name)}")

                # Unpack the information
                box = f[box_ref[0]]
                # presumably: single-digit images store scalars directly, multi-digit
                # ones store HDF5 references that must be dereferenced via f[...] — TODO confirm
                if box["left"].shape[0] == 1:
                    box_dict = {k: [int(vals[0][0])] for k, vals in box.items()}
                else:
                    box_dict = {k: [int(f[v[0]][()].item()) for v in vals] for k, vals in box.items()}

                # Convert it to the right format: one (left, top, width, height) row per digit
                coords: np.ndarray = np.array(
                    [box_dict["left"], box_dict["top"], box_dict["width"], box_dict["height"]], dtype=np_dtype
                ).transpose()
                label_targets = list(map(str, box_dict["label"]))

                if use_polygons:
                    # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                    box_targets: np.ndarray = np.stack(
                        [
                            np.stack([coords[:, 0], coords[:, 1]], axis=-1),
                            np.stack([coords[:, 0] + coords[:, 2], coords[:, 1]], axis=-1),
                            np.stack([coords[:, 0] + coords[:, 2], coords[:, 1] + coords[:, 3]], axis=-1),
                            np.stack([coords[:, 0], coords[:, 1] + coords[:, 3]], axis=-1),
                        ],
                        axis=1,
                    )
                else:
                    # x, y, width, height -> xmin, ymin, xmax, ymax
                    box_targets = np.stack(
                        [
                            coords[:, 0],
                            coords[:, 1],
                            coords[:, 0] + coords[:, 2],
                            coords[:, 1] + coords[:, 3],
                        ],
                        axis=-1,
                    )

                if recognition_task:
                    # Store (crop, label) pairs, skipping empty crops/labels
                    crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_name), geoms=box_targets)
                    for crop, label in zip(crops, label_targets):
                        if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
                            self.data.append((crop, label))
                else:
                    self.data.append((img_name, dict(boxes=box_targets, labels=label_targets)))

        self.root = tmp_root

    def extra_repr(self) -> str:
        return f"train={self.train}"
doctr/datasets/svt.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import os
7
+ from typing import Any, Dict, List, Tuple, Union
8
+
9
+ import defusedxml.ElementTree as ET
10
+ import numpy as np
11
+ from tqdm import tqdm
12
+
13
+ from .datasets import VisionDataset
14
+ from .utils import convert_target_to_relative, crop_bboxes_from_image
15
+
16
+ __all__ = ["SVT"]
17
+
18
+
19
class SVT(VisionDataset):
    """SVT dataset from `"The Street View Text Dataset - UCSD Computer Vision"
    <http://vision.ucsd.edu/~kai/svt/>`_.

    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/svt-grid.png&src=0
        :align: center

    >>> from doctr.datasets import SVT
    >>> train_set = SVT(train=True, download=True)
    >>> img, target = train_set[0]

    Args:
    ----
        train: whether the subset should be the training one
        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
        recognition_task: whether the dataset should be used for recognition task
        **kwargs: keyword arguments from `VisionDataset`.
    """

    URL = "http://vision.ucsd.edu/~kai/svt/svt.zip"
    SHA256 = "63b3d55e6b6d1e036e2a844a20c034fe3af3c32e4d914d6e0c4a3cd43df3bebf"

    def __init__(
        self,
        train: bool = True,
        use_polygons: bool = False,
        recognition_task: bool = False,
        **kwargs: Any,
    ) -> None:
        super().__init__(
            self.URL,
            None,
            self.SHA256,
            True,
            pre_transforms=convert_target_to_relative if not recognition_task else None,
            **kwargs,
        )
        self.train = train
        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
        np_dtype = np.float32

        # Load xml data
        # NOTE(review): the archive extracts into an "svt1" subfolder; the SHA256
        # check here looks like a sentinel for "downloaded archive vs local folder"
        tmp_root = os.path.join(self.root, "svt1") if self.SHA256 else self.root
        xml_tree = (
            ET.parse(os.path.join(tmp_root, "train.xml"))
            if self.train
            else ET.parse(os.path.join(tmp_root, "test.xml"))
        )
        xml_root = xml_tree.getroot()

        for image in tqdm(iterable=xml_root, desc="Unpacking SVT", total=len(xml_root)):
            # Positional unpacking of the <image> children; order matters here
            name, _, _, _resolution, rectangles = image

            # File existence check
            if not os.path.exists(os.path.join(tmp_root, name.text)):
                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, name.text)}")

            if use_polygons:
                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                _boxes = [
                    [
                        [float(rect.attrib["x"]), float(rect.attrib["y"])],
                        [float(rect.attrib["x"]) + float(rect.attrib["width"]), float(rect.attrib["y"])],
                        [
                            float(rect.attrib["x"]) + float(rect.attrib["width"]),
                            float(rect.attrib["y"]) + float(rect.attrib["height"]),
                        ],
                        [float(rect.attrib["x"]), float(rect.attrib["y"]) + float(rect.attrib["height"])],
                    ]
                    for rect in rectangles
                ]
            else:
                # x_min, y_min, x_max, y_max
                _boxes = [
                    [
                        float(rect.attrib["x"]),  # type: ignore[list-item]
                        float(rect.attrib["y"]),  # type: ignore[list-item]
                        float(rect.attrib["x"]) + float(rect.attrib["width"]),  # type: ignore[list-item]
                        float(rect.attrib["y"]) + float(rect.attrib["height"]),  # type: ignore[list-item]
                    ]
                    for rect in rectangles
                ]

            boxes: np.ndarray = np.asarray(_boxes, dtype=np_dtype)
            # Get the labels
            # presumably each <taggedRectangle> holds exactly one <tag> child — TODO confirm
            labels = [lab.text for rect in rectangles for lab in rect]

            if recognition_task:
                crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, name.text), geoms=boxes)
                for crop, label in zip(crops, labels):
                    if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
                        self.data.append((crop, label))
            else:
                self.data.append((name.text, dict(boxes=boxes, labels=labels)))

        self.root = tmp_root

    def extra_repr(self) -> str:
        return f"train={self.train}"
doctr/datasets/synthtext.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import glob
7
+ import os
8
+ from typing import Any, Dict, List, Tuple, Union
9
+
10
+ import numpy as np
11
+ from PIL import Image
12
+ from scipy import io as sio
13
+ from tqdm import tqdm
14
+
15
+ from .datasets import VisionDataset
16
+ from .utils import convert_target_to_relative, crop_bboxes_from_image
17
+
18
+ __all__ = ["SynthText"]
19
+
20
+
21
+ class SynthText(VisionDataset):
22
+ """SynthText dataset from `"Synthetic Data for Text Localisation in Natural Images"
23
+ <https://arxiv.org/abs/1604.06646>`_ | `"repository" <https://github.com/ankush-me/SynthText>`_ |
24
+ `"website" <https://www.robots.ox.ac.uk/~vgg/data/scenetext/>`_.
25
+
26
+ .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/svt-grid.png&src=0
27
+ :align: center
28
+
29
+ >>> from doctr.datasets import SynthText
30
+ >>> train_set = SynthText(train=True, download=True)
31
+ >>> img, target = train_set[0]
32
+
33
+ Args:
34
+ ----
35
+ train: whether the subset should be the training one
36
+ use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
37
+ recognition_task: whether the dataset should be used for recognition task
38
+ **kwargs: keyword arguments from `VisionDataset`.
39
+ """
40
+
41
+ URL = "https://thor.robots.ox.ac.uk/~vgg/data/scenetext/SynthText.zip"
42
+ SHA256 = "28ab030485ec8df3ed612c568dd71fb2793b9afbfa3a9d9c6e792aef33265bf1"
43
+
44
+ def __init__(
45
+ self,
46
+ train: bool = True,
47
+ use_polygons: bool = False,
48
+ recognition_task: bool = False,
49
+ **kwargs: Any,
50
+ ) -> None:
51
+ super().__init__(
52
+ self.URL,
53
+ None,
54
+ file_hash=None,
55
+ extract_archive=True,
56
+ pre_transforms=convert_target_to_relative if not recognition_task else None,
57
+ **kwargs,
58
+ )
59
+ self.train = train
60
+ self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
61
+ np_dtype = np.float32
62
+
63
+ # Load mat data
64
+ tmp_root = os.path.join(self.root, "SynthText") if self.SHA256 else self.root
65
+ # define folder to write SynthText recognition dataset
66
+ reco_folder_name = "SynthText_recognition_train" if self.train else "SynthText_recognition_test"
67
+ reco_folder_name = "Poly_" + reco_folder_name if use_polygons else reco_folder_name
68
+ reco_folder_path = os.path.join(tmp_root, reco_folder_name)
69
+ reco_images_counter = 0
70
+
71
+ if recognition_task and os.path.isdir(reco_folder_path):
72
+ self._read_from_folder(reco_folder_path)
73
+ return
74
+ elif recognition_task and not os.path.isdir(reco_folder_path):
75
+ os.makedirs(reco_folder_path, exist_ok=False)
76
+
77
+ mat_data = sio.loadmat(os.path.join(tmp_root, "gt.mat"))
78
+ train_samples = int(len(mat_data["imnames"][0]) * 0.9)
79
+ set_slice = slice(train_samples) if self.train else slice(train_samples, None)
80
+ paths = mat_data["imnames"][0][set_slice]
81
+ boxes = mat_data["wordBB"][0][set_slice]
82
+ labels = mat_data["txt"][0][set_slice]
83
+ del mat_data
84
+
85
+ for img_path, word_boxes, txt in tqdm(
86
+ iterable=zip(paths, boxes, labels), desc="Unpacking SynthText", total=len(paths)
87
+ ):
88
+ # File existence check
89
+ if not os.path.exists(os.path.join(tmp_root, img_path[0])):
90
+ raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path[0])}")
91
+
92
+ labels = [elt for word in txt.tolist() for elt in word.split()]
93
+ # (x, y) coordinates of top left, top right, bottom right, bottom left corners
94
+ word_boxes = (
95
+ word_boxes.transpose(2, 1, 0)
96
+ if word_boxes.ndim == 3
97
+ else np.expand_dims(word_boxes.transpose(1, 0), axis=0)
98
+ )
99
+
100
+ if not use_polygons:
101
+ # xmin, ymin, xmax, ymax
102
+ word_boxes = np.concatenate((word_boxes.min(axis=1), word_boxes.max(axis=1)), axis=1)
103
+
104
+ if recognition_task:
105
+ crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_path[0]), geoms=word_boxes)
106
+ for crop, label in zip(crops, labels):
107
+ if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
108
+ # write data to disk
109
+ with open(os.path.join(reco_folder_path, f"{reco_images_counter}.txt"), "w") as f:
110
+ f.write(label)
111
+ tmp_img = Image.fromarray(crop)
112
+ tmp_img.save(os.path.join(reco_folder_path, f"{reco_images_counter}.png"))
113
+ reco_images_counter += 1
114
+ else:
115
+ self.data.append((img_path[0], dict(boxes=np.asarray(word_boxes, dtype=np_dtype), labels=labels)))
116
+
117
+ if recognition_task:
118
+ self._read_from_folder(reco_folder_path)
119
+
120
+ self.root = tmp_root
121
+
122
+ def extra_repr(self) -> str:
123
+ return f"train={self.train}"
124
+
125
+ def _read_from_folder(self, path: str) -> None:
126
+ for img_path in glob.glob(os.path.join(path, "*.png")):
127
+ with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f:
128
+ self.data.append((img_path, f.read()))
doctr/datasets/utils.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import string
7
+ import unicodedata
8
+ from collections.abc import Sequence
9
+ from functools import partial
10
+ from pathlib import Path
11
+ from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
12
+ from typing import Sequence as SequenceType
13
+
14
+ import numpy as np
15
+ from PIL import Image
16
+
17
+ from doctr.io.image import get_img_shape
18
+ from doctr.utils.geometry import convert_to_relative_coords, extract_crops, extract_rcrops
19
+
20
+ from .vocabs import VOCABS
21
+
22
+ __all__ = ["translate", "encode_string", "decode_sequence", "encode_sequences", "pre_transform_multiclass"]
23
+
24
+ ImageTensor = TypeVar("ImageTensor")
25
+
26
+
27
def translate(
    input_string: str,
    vocab_name: str,
    unknown_char: str = "■",
) -> str:
    """Project a string onto a target vocabulary.

    Characters already in the vocabulary pass through unchanged; whitespace is
    dropped; other characters are ASCII-folded (accents stripped) and, if still
    not representable, replaced by ``unknown_char``.

    Args:
    ----
        input_string: input string to translate
        vocab_name: vocabulary to use (french, latin, ...)
        unknown_char: unknown character for non-translatable characters

    Returns:
    -------
        A string translated in a given vocab
    """
    if VOCABS.get(vocab_name) is None:
        raise KeyError("output vocabulary must be in vocabs dictionnary")

    vocab = VOCABS[vocab_name]
    out_chars = []
    for char in input_string:
        if char not in vocab:
            # Whitespace is simply dropped
            if char in string.whitespace:
                continue
            # Try an ASCII fold (e.g. strip diacritics) before giving up
            char = unicodedata.normalize("NFD", char).encode("ascii", "ignore").decode("ascii")
            if char == "" or char not in vocab:
                # Normalization failed or still out of vocab -> unknown marker
                char = unknown_char
        out_chars.append(char)
    return "".join(out_chars)
61
+
62
+
63
def encode_string(
    input_string: str,
    vocab: str,
) -> List[int]:
    """Given a predefined mapping, encode the string to a sequence of numbers

    Args:
    ----
        input_string: string to encode
        vocab: vocabulary (string), the encoding is given by the indexing of the character sequence

    Returns:
    -------
        A list encoding the input_string

    Raises:
    ------
        ValueError: if a character of ``input_string`` is absent from ``vocab``
    """
    try:
        return list(map(vocab.index, input_string))
    except ValueError as err:
        # Fix: the original message used a backslash line-continuation inside the
        # f-string, which embedded a long run of spaces in the middle of the text.
        # Also chain the original exception for a complete traceback.
        raise ValueError(
            f"some characters cannot be found in 'vocab'. "
            f"Please check the input string {input_string} and the vocabulary {vocab}"
        ) from err
85
+
86
+
87
def decode_sequence(
    input_seq: Union[np.ndarray, SequenceType[int]],
    mapping: str,
) -> str:
    """Map a sequence of vocabulary indices back to the corresponding string.

    Args:
    ----
        input_seq: array to decode
        mapping: vocabulary (string), the encoding is given by the indexing of the character sequence

    Returns:
    -------
        A string, decoded from input_seq
    """
    if not isinstance(input_seq, (Sequence, np.ndarray)):
        raise TypeError("Invalid sequence type")
    # Arrays must hold in-range integer indices
    if isinstance(input_seq, np.ndarray) and (input_seq.dtype != np.int_ or input_seq.max() >= len(mapping)):
        raise AssertionError("Input must be an array of int, with max less than mapping size")

    return "".join(mapping[idx] for idx in input_seq)
108
+
109
+
110
def encode_sequences(
    sequences: List[str],
    vocab: str,
    target_size: Optional[int] = None,
    eos: int = -1,
    sos: Optional[int] = None,
    pad: Optional[int] = None,
    dynamic_seq_length: bool = False,
) -> np.ndarray:
    """Encode character sequences using a given vocab as mapping

    Args:
    ----
        sequences: the list of character sequences of size N
        vocab: the ordered vocab to use for encoding
        target_size: maximum length of the encoded data
        eos: encoding of End Of String
        sos: optional encoding of Start Of String
        pad: optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD
        dynamic_seq_length: if `target_size` is specified, uses it as upper bound and enables dynamic sequence size

    Returns:
    -------
        the padded encoded data as a tensor
    """
    # eos must not collide with a valid vocab index
    if 0 <= eos < len(vocab):
        raise ValueError("argument 'eos' needs to be outside of vocab possible indices")

    if not isinstance(target_size, int) or dynamic_seq_length:
        # Maximum string length + EOS
        max_length = max(len(w) for w in sequences) + 1
        if isinstance(sos, int):
            max_length += 1
        if isinstance(pad, int):
            max_length += 1
        target_size = max_length if not isinstance(target_size, int) else min(max_length, target_size)

    # Pad all sequences
    if isinstance(pad, int):  # pad with padding symbol
        if 0 <= pad < len(vocab):
            raise ValueError("argument 'pad' needs to be outside of vocab possible indices")
        # In that case, add EOS at the end of the word before padding
        default_symbol = pad
    else:  # pad with eos symbol
        default_symbol = eos
    # Pre-fill with the pad/eos symbol; real tokens are written over the prefix below
    encoded_data: np.ndarray = np.full([len(sequences), target_size], default_symbol, dtype=np.int32)

    # Encode the strings
    for idx, seq in enumerate(map(partial(encode_string, vocab=vocab), sequences)):
        if isinstance(pad, int):  # add eos at the end of the sequence
            seq.append(eos)
        # Truncate sequences longer than target_size
        encoded_data[idx, : min(len(seq), target_size)] = seq[: min(len(seq), target_size)]

    if isinstance(sos, int):  # place sos symbol at the beginning of each sequence
        if 0 <= sos < len(vocab):
            raise ValueError("argument 'sos' needs to be outside of vocab possible indices")
        # NOTE: np.roll without an axis rolls the *flattened* array, so column 0 of each
        # row receives the last element of the previous row — then it is immediately
        # overwritten with sos, which makes this equivalent to a right-shift per row
        encoded_data = np.roll(encoded_data, 1)
        encoded_data[:, 0] = sos

    return encoded_data
170
+
171
+
172
def convert_target_to_relative(img: ImageTensor, target: Dict[str, Any]) -> Tuple[ImageTensor, Dict[str, Any]]:
    """Rescale the target's absolute box coordinates to be relative to the image size.

    Note: mutates ``target`` in place (the "boxes" entry is overwritten) and returns it.
    """
    target["boxes"] = convert_to_relative_coords(target["boxes"], get_img_shape(img))
    return img, target
175
+
176
+
177
def crop_bboxes_from_image(img_path: Union[str, Path], geoms: np.ndarray) -> List[np.ndarray]:
    """Extract the image patch of every bounding box found in an image.

    Args:
    ----
        img_path: path to the image
        geoms: a array of polygons of shape (N, 4, 2) or of straight boxes of shape (N, 4)

    Returns:
    -------
        a list of cropped images
    """
    page: np.ndarray = np.array(Image.open(img_path).convert("RGB"))
    # Rotated polygons: (N, 4, 2) corner coordinates
    if geoms.ndim == 3 and geoms.shape[1:] == (4, 2):
        return extract_rcrops(page, geoms.astype(dtype=int))
    # Straight boxes: (N, 4) as xmin, ymin, xmax, ymax
    if geoms.ndim == 2 and geoms.shape[1] == 4:
        return extract_crops(page, geoms.astype(dtype=int))
    raise ValueError("Invalid geometry format")
196
+
197
+
198
def pre_transform_multiclass(img, target: Tuple[np.ndarray, List]) -> Tuple[np.ndarray, Dict[str, List]]:
    """Converts multiclass target to relative coordinates.

    Args:
    ----
        img: Image
        target: tuple of target polygons and their classes names

    Returns:
    -------
        Image and dictionary of boxes, with class names as keys
    """
    rel_boxes = convert_to_relative_coords(target[0], get_img_shape(img))
    class_names = target[1]
    # Bucket the polygons by class, with deterministically (alphabetically) ordered keys
    grouped: Dict = {name: [] for name in sorted(set(class_names))}
    for name, poly in zip(class_names, rel_boxes):
        grouped[name].append(poly)
    # Stack each class' polygons into a single (M, ...) array
    return img, {name: np.stack(polys, axis=0) for name, polys in grouped.items()}
doctr/datasets/vocabs.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import string
7
+ from typing import Dict
8
+
9
+ __all__ = ["VOCABS"]
10
+
11
+
12
# Base character sets; language vocabs below are composed from these building blocks
VOCABS: Dict[str, str] = {
    "digits": string.digits,
    "ascii_letters": string.ascii_letters,
    "punctuation": string.punctuation,
    "currency": "£€¥¢฿",
    "ancient_greek": "αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ",
    "arabic_letters": "ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىي",
    "persian_letters": "پچڢڤگ",
    "hindi_digits": "٠١٢٣٤٥٦٧٨٩",
    "arabic_diacritics": "ًٌٍَُِّْ",
    "arabic_punctuation": "؟؛«»—",
}

# Layered composition: latin -> english -> per-language extensions.
# NOTE: vocab strings are index-sensitive (see encode_string), so the
# concatenation order below must not change for pretrained models.
VOCABS["latin"] = VOCABS["digits"] + VOCABS["ascii_letters"] + VOCABS["punctuation"]
VOCABS["english"] = VOCABS["latin"] + "°" + VOCABS["currency"]
VOCABS["legacy_french"] = VOCABS["latin"] + "°" + "àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ" + VOCABS["currency"]
VOCABS["french"] = VOCABS["english"] + "àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ"
VOCABS["portuguese"] = VOCABS["english"] + "áàâãéêíïóôõúüçÁÀÂÃÉÊÍÏÓÔÕÚÜÇ"
VOCABS["spanish"] = VOCABS["english"] + "áéíóúüñÁÉÍÓÚÜÑ" + "¡¿"
VOCABS["italian"] = VOCABS["english"] + "àèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ"
VOCABS["german"] = VOCABS["english"] + "äöüßÄÖÜẞ"
VOCABS["arabic"] = (
    VOCABS["digits"]
    + VOCABS["hindi_digits"]
    + VOCABS["arabic_letters"]
    + VOCABS["persian_letters"]
    + VOCABS["arabic_diacritics"]
    + VOCABS["arabic_punctuation"]
    + VOCABS["punctuation"]
)
VOCABS["czech"] = VOCABS["english"] + "áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ"
VOCABS["polish"] = VOCABS["english"] + "ąćęłńóśźżĄĆĘŁŃÓŚŹŻ"
VOCABS["dutch"] = VOCABS["english"] + "áéíóúüñÁÉÍÓÚÜÑ"
VOCABS["norwegian"] = VOCABS["english"] + "æøåÆØÅ"
VOCABS["danish"] = VOCABS["english"] + "æøåÆØÅ"
VOCABS["finnish"] = VOCABS["english"] + "äöÄÖ"
VOCABS["swedish"] = VOCABS["english"] + "åäöÅÄÖ"
VOCABS["vietnamese"] = (
    VOCABS["english"]
    + "áàảạãăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựiíìỉĩịýỳỷỹỵ"
    + "ÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰIÍÌỈĨỊÝỲỶỸỴ"
)
VOCABS["hebrew"] = VOCABS["english"] + "אבגדהוזחטיכלמנסעפצקרשת" + "₪"
# dict.fromkeys de-duplicates while preserving first-occurrence order
VOCABS["multilingual"] = "".join(
    dict.fromkeys(
        VOCABS["french"]
        + VOCABS["portuguese"]
        + VOCABS["spanish"]
        + VOCABS["german"]
        + VOCABS["czech"]
        + VOCABS["polish"]
        + VOCABS["dutch"]
        + VOCABS["italian"]
        + VOCABS["norwegian"]
        + VOCABS["danish"]
        + VOCABS["finnish"]
        + VOCABS["swedish"]
        + "§"
    )
)
doctr/datasets/wildreceipt.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ import json
7
+ import os
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List, Tuple, Union
10
+
11
+ import numpy as np
12
+
13
+ from .datasets import AbstractDataset
14
+ from .utils import convert_target_to_relative, crop_bboxes_from_image
15
+
16
+ __all__ = ["WILDRECEIPT"]
17
+
18
+
19
class WILDRECEIPT(AbstractDataset):
    """WildReceipt dataset from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction"
    <https://arxiv.org/abs/2103.14470v1>`_ |
    `repository <https://download.openmmlab.com/mmocr/data/wildreceipt.tar>`_.

    .. image:: https://doctr-static.mindee.com/models?id=v0.7.0/wildreceipt-dataset.jpg&src=0
        :align: center

    >>> # NOTE: You need to download the dataset first.
    >>> from doctr.datasets import WILDRECEIPT
    >>> train_set = WILDRECEIPT(train=True, img_folder="/path/to/wildreceipt/",
    >>>                     label_path="/path/to/wildreceipt/train.txt")
    >>> img, target = train_set[0]
    >>> test_set = WILDRECEIPT(train=False, img_folder="/path/to/wildreceipt/",
    >>>                    label_path="/path/to/wildreceipt/test.txt")
    >>> img, target = test_set[0]

    Args:
    ----
        img_folder: folder with all the images of the dataset
        label_path: path to the annotations file of the dataset
        train: whether the subset should be the training one
        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
        recognition_task: whether the dataset should be used for recognition task
        **kwargs: keyword arguments from `AbstractDataset`.
    """

    def __init__(
        self,
        img_folder: str,
        label_path: str,
        train: bool = True,
        use_polygons: bool = False,
        recognition_task: bool = False,
        **kwargs: Any,
    ) -> None:
        super().__init__(
            img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
        )
        # File existence check
        if not os.path.exists(label_path) or not os.path.exists(img_folder):
            raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")

        tmp_root = img_folder
        self.train = train
        np_dtype = np.float32
        self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any]]]] = []

        with open(label_path, "r") as file:
            data = file.read()
        # Split the text file into separate JSON strings (one record per line)
        json_strings = data.strip().split("\n")
        box: Union[List[float], np.ndarray]
        for json_string in json_strings:
            json_data = json.loads(json_string)
            img_path = json_data["file_name"]
            annotations = json_data["annotations"]
            # BUGFIX: reset the accumulator for every image. It was previously
            # initialized once before this loop, so each sample's target also
            # contained the boxes/labels of every previously parsed image.
            _targets = []
            for annotation in annotations:
                coordinates = annotation["box"]
                if use_polygons:
                    # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                    box = np.array(
                        [
                            [coordinates[0], coordinates[1]],
                            [coordinates[2], coordinates[3]],
                            [coordinates[4], coordinates[5]],
                            [coordinates[6], coordinates[7]],
                        ],
                        dtype=np_dtype,
                    )
                else:
                    # Collapse the 8-point polygon to a straight xmin, ymin, xmax, ymax box
                    x, y = coordinates[::2], coordinates[1::2]
                    box = [min(x), min(y), max(x), max(y)]
                _targets.append((annotation["text"], box))
            text_targets, box_targets = zip(*_targets)

            if recognition_task:
                crops = crop_bboxes_from_image(
                    img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0)
                )
                for crop, label in zip(crops, list(text_targets)):
                    # Skip empty labels and multi-word labels for the recognition task
                    if label and " " not in label:
                        self.data.append((crop, label))
            else:
                self.data.append((
                    img_path,
                    dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)),
                ))
        self.root = tmp_root

    def extra_repr(self) -> str:
        return f"train={self.train}"
doctr/file_utils.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ # Adapted from https://github.com/huggingface/transformers/blob/master/src/transformers/file_utils.py
7
+
8
+ import importlib.util
9
+ import logging
10
+ import os
11
+ import sys
12
+
13
+ CLASS_NAME: str = "words"
14
+
15
+
16
+ if sys.version_info < (3, 8): # pragma: no cover
17
+ import importlib_metadata
18
+ else:
19
+ import importlib.metadata as importlib_metadata
20
+
21
+
22
+ __all__ = ["is_tf_available", "is_torch_available", "CLASS_NAME"]
23
+
24
+ ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
25
+ ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
26
+
27
+ USE_TF = os.environ.get("USE_TF", "AUTO").upper()
28
+ USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
29
+
30
+
31
+ if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
32
+ _torch_available = importlib.util.find_spec("torch") is not None
33
+ if _torch_available:
34
+ try:
35
+ _torch_version = importlib_metadata.version("torch")
36
+ logging.info(f"PyTorch version {_torch_version} available.")
37
+ except importlib_metadata.PackageNotFoundError: # pragma: no cover
38
+ _torch_available = False
39
+ else: # pragma: no cover
40
+ logging.info("Disabling PyTorch because USE_TF is set")
41
+ _torch_available = False
42
+
43
+
44
+ if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES:
45
+ _tf_available = importlib.util.find_spec("tensorflow") is not None
46
+ if _tf_available:
47
+ candidates = (
48
+ "tensorflow",
49
+ "tensorflow-cpu",
50
+ "tensorflow-gpu",
51
+ "tf-nightly",
52
+ "tf-nightly-cpu",
53
+ "tf-nightly-gpu",
54
+ "intel-tensorflow",
55
+ "tensorflow-rocm",
56
+ "tensorflow-macos",
57
+ )
58
+ _tf_version = None
59
+ # For the metadata, we have to look for both tensorflow and tensorflow-cpu
60
+ for pkg in candidates:
61
+ try:
62
+ _tf_version = importlib_metadata.version(pkg)
63
+ break
64
+ except importlib_metadata.PackageNotFoundError:
65
+ pass
66
+ _tf_available = _tf_version is not None
67
+ if _tf_available:
68
+ if int(_tf_version.split(".")[0]) < 2: # type: ignore[union-attr] # pragma: no cover
69
+ logging.info(f"TensorFlow found but with version {_tf_version}. DocTR requires version 2 minimum.")
70
+ _tf_available = False
71
+ else:
72
+ logging.info(f"TensorFlow version {_tf_version} available.")
73
+ else: # pragma: no cover
74
+ logging.info("Disabling Tensorflow because USE_TORCH is set")
75
+ _tf_available = False
76
+
77
+
78
+ if not _torch_available and not _tf_available: # pragma: no cover
79
+ raise ModuleNotFoundError(
80
+ "DocTR requires either TensorFlow or PyTorch to be installed. Please ensure one of them"
81
+ " is installed and that either USE_TF or USE_TORCH is enabled."
82
+ )
83
+
84
+
85
+ def is_torch_available():
86
+ """Whether PyTorch is installed."""
87
+ return _torch_available
88
+
89
+
90
+ def is_tf_available():
91
+ """Whether TensorFlow is installed."""
92
+ return _tf_available
doctr/io/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .elements import *
2
+ from .html import *
3
+ from .image import *
4
+ from .pdf import *
5
+ from .reader import *
doctr/io/elements.py ADDED
@@ -0,0 +1,621 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ from typing import Any, Dict, List, Optional, Tuple, Union
7
+
8
+ from defusedxml import defuse_stdlib
9
+
10
+ defuse_stdlib()
11
+ from xml.etree import ElementTree as ET
12
+ from xml.etree.ElementTree import Element as ETElement
13
+ from xml.etree.ElementTree import SubElement
14
+
15
+ import matplotlib.pyplot as plt
16
+ import numpy as np
17
+
18
+ import doctr
19
+ from doctr.utils.common_types import BoundingBox
20
+ from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
21
+ from doctr.utils.repr import NestedObject
22
+ from doctr.utils.visualization import synthesize_kie_page, synthesize_page, visualize_kie_page, visualize_page
23
+
24
+ __all__ = ["Element", "Word", "Artefact", "Line", "Prediction", "Block", "Page", "KIEPage", "Document"]
25
+
26
+
27
class Element(NestedObject):
    """Abstract document element providing nested-dict export and text rendering."""

    _children_names: List[str] = []
    _exported_keys: List[str] = []

    def __init__(self, **kwargs: Any) -> None:
        # Only declared child collections may be attached; anything else is a typo.
        for name, child in kwargs.items():
            if name not in self._children_names:
                raise KeyError(f"{self.__class__.__name__} object does not have any attribute named '{name}'")
            setattr(self, name, child)

    def export(self) -> Dict[str, Any]:
        """Serialize the element and all of its children into a nested dict."""
        exported = {key: getattr(self, key) for key in self._exported_keys}
        for child_name in self._children_names:
            children = getattr(self, child_name)
            if child_name in ["predictions"]:
                # predictions are stored per detection class: {class_name: [elements]}
                exported[child_name] = {label: [item.export() for item in items] for label, items in children.items()}
            else:
                exported[child_name] = [child.export() for child in children]
        return exported

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        raise NotImplementedError

    def render(self) -> str:
        raise NotImplementedError
59
+
60
+
61
class Word(Element):
    """A single recognized word.

    Args:
    ----
        value: the text string of the word
        confidence: the confidence associated with the text prediction
        geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
            the page's size
        crop_orientation: the general orientation of the crop in degrees and its confidence
    """

    _exported_keys: List[str] = ["value", "confidence", "geometry", "crop_orientation"]
    _children_names: List[str] = []

    def __init__(
        self,
        value: str,
        confidence: float,
        geometry: Union[BoundingBox, np.ndarray],
        crop_orientation: Dict[str, Any],
    ) -> None:
        super().__init__()
        self.value = value
        self.confidence = confidence
        self.geometry = geometry
        self.crop_orientation = crop_orientation

    def render(self) -> str:
        """Return the word's text."""
        return self.value

    def extra_repr(self) -> str:
        return f"value='{self.value}', confidence={self.confidence:.2}"

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuild a Word from its exported dict representation."""
        return cls(**{key: save_dict[key] for key in cls._exported_keys})
100
+
101
+
102
class Artefact(Element):
    """Implements a non-textual element

    Args:
    ----
        artefact_type: the type of artefact
        confidence: the confidence of the type prediction
        geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
            the page's size.
    """

    _exported_keys: List[str] = ["geometry", "type", "confidence"]
    _children_names: List[str] = []

    def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None:
        super().__init__()
        self.geometry = geometry
        self.type = artefact_type
        self.confidence = confidence

    def render(self) -> str:
        """Renders the full text of the element as a placeholder tag"""
        return f"[{self.type.upper()}]"

    def extra_repr(self) -> str:
        return f"type='{self.type}', confidence={self.confidence:.2}"

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuild an Artefact from its exported dict representation.

        Note: the exported key is "type" while the constructor argument is named
        "artefact_type", so the keys cannot be forwarded verbatim (doing so raised
        a TypeError for unexpected keyword argument 'type').
        """
        return cls(
            artefact_type=save_dict["type"],
            confidence=save_dict["confidence"],
            geometry=save_dict["geometry"],
        )
133
+
134
+
135
class Line(Element):
    """A text line, i.e. an ordered collection of words.

    Args:
    ----
        words: list of word elements
        geometry: bounding box of the line in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
            the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing
            all words in it.
    """

    _exported_keys: List[str] = ["geometry"]
    _children_names: List[str] = ["words"]
    words: List[Word] = []

    def __init__(
        self,
        words: List[Word],
        geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
    ) -> None:
        if geometry is None:
            # Resolve the geometry as the smallest box enclosing every word.
            # Rotated boxes carry 4 points, straight ones only 2 corners.
            if len(words[0].geometry) == 4:
                resolve = resolve_enclosing_rbbox
            else:
                resolve = resolve_enclosing_bbox
            geometry = resolve([w.geometry for w in words])  # type: ignore[operator]

        super().__init__(words=words)
        self.geometry = geometry

    def render(self) -> str:
        """Return the line's text, words separated by single spaces."""
        return " ".join(w.render() for w in self.words)

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuild a Line (and its words) from its exported dict representation."""
        restored = {k: save_dict[k] for k in cls._exported_keys}
        restored["words"] = [Word.from_dict(word_dict) for word_dict in save_dict["words"]]
        return cls(**restored)
175
+
176
+
177
class Prediction(Word):
    """A KIE prediction: a word-like element attached to a detection class."""

    def render(self) -> str:
        """Return the predicted text."""
        return self.value

    def extra_repr(self) -> str:
        return f"value='{self.value}', confidence={self.confidence:.2}, bounding_box={self.geometry}"
186
+
187
+
188
class Block(Element):
    """Implements a block element as a collection of lines and artefacts

    Args:
    ----
        lines: list of line elements
        artefacts: list of artefacts
        geometry: bounding box of the block in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
            the page's size. If not specified, it will be resolved by default to the smallest bounding box enclosing
            all lines and artefacts in it.
    """

    _exported_keys: List[str] = ["geometry"]
    _children_names: List[str] = ["lines", "artefacts"]
    lines: List[Line] = []
    artefacts: List[Artefact] = []

    def __init__(
        self,
        lines: Optional[List[Line]] = None,
        artefacts: Optional[List[Artefact]] = None,
        geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
    ) -> None:
        # Avoid the shared-mutable-default-argument pitfall: the previous `lines=[]`
        # defaults exposed one list object across all calls.
        lines = [] if lines is None else lines
        artefacts = [] if artefacts is None else artefacts
        # Resolve the geometry using the smallest enclosing bounding box
        if geometry is None:
            # NOTE: requires at least one line (lines[0] is inspected to pick the box type)
            line_boxes = [word.geometry for line in lines for word in line.words]
            artefact_boxes = [artefact.geometry for artefact in artefacts]
            box_resolution_fn = (
                resolve_enclosing_rbbox if isinstance(lines[0].geometry, np.ndarray) else resolve_enclosing_bbox
            )
            geometry = box_resolution_fn(line_boxes + artefact_boxes)  # type: ignore[operator]

        super().__init__(lines=lines, artefacts=artefacts)
        self.geometry = geometry

    def render(self, line_break: str = "\n") -> str:
        """Renders the full text of the element"""
        return line_break.join(line.render() for line in self.lines)

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuild a Block (and its lines/artefacts) from its exported dict representation."""
        kwargs = {k: save_dict[k] for k in cls._exported_keys}
        kwargs.update({
            "lines": [Line.from_dict(_dict) for _dict in save_dict["lines"]],
            "artefacts": [Artefact.from_dict(_dict) for _dict in save_dict["artefacts"]],
        })
        return cls(**kwargs)
235
+
236
+
237
class Page(Element):
    """Implements a page element as a collection of blocks

    Args:
    ----
        page: image encoded as a numpy array in uint8
        blocks: list of block elements
        page_idx: the index of the page in the input raw document
        dimensions: the page size in pixels in format (height, width)
        orientation: a dictionary with the value of the rotation angle in degrees and confidence of the prediction
        language: a dictionary with the language value and confidence of the prediction
    """

    _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"]
    _children_names: List[str] = ["blocks"]
    blocks: List[Block] = []

    def __init__(
        self,
        page: np.ndarray,
        blocks: List[Block],
        page_idx: int,
        dimensions: Tuple[int, int],
        orientation: Optional[Dict[str, Any]] = None,
        language: Optional[Dict[str, Any]] = None,
    ) -> None:
        super().__init__(blocks=blocks)
        self.page = page
        self.page_idx = page_idx
        self.dimensions = dimensions
        # Fall back to unknown orientation/language when none is provided
        self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None)
        self.language = language if isinstance(language, dict) else dict(value=None, confidence=None)

    def render(self, block_break: str = "\n\n") -> str:
        """Renders the full text of the element"""
        return block_break.join(b.render() for b in self.blocks)

    def extra_repr(self) -> str:
        return f"dimensions={self.dimensions}"

    def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
        """Overlay the result on a given image

        Args:
        ----
            interactive: whether the display should be interactive
            preserve_aspect_ratio: pass True if you passed True to the predictor
            **kwargs: additional keyword arguments passed to the matplotlib.pyplot.show method
        """
        visualize_page(self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio)
        plt.show(**kwargs)

    def synthesize(self, **kwargs) -> np.ndarray:
        """Synthesize the page from the predictions

        Returns
        -------
            synthesized page
        """
        return synthesize_page(self.export(), **kwargs)

    def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> Tuple[bytes, ET.ElementTree]:
        """Export the page as XML (hOCR-format)
        convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md

        Args:
        ----
            file_title: the title of the XML file

        Returns:
        -------
            a tuple of the XML byte string, and its ElementTree

        Raises:
        ------
            TypeError: if any block uses rotated (non 2-point) geometry
        """
        p_idx = self.page_idx
        # hOCR element ids are 1-based and unique per element kind
        block_count: int = 1
        line_count: int = 1
        word_count: int = 1
        height, width = self.dimensions
        # NOTE(review): self.language holds {"value", "confidence"} keys, so this membership
        # test is effectively always False and falls back to "en" — confirm intended behavior.
        language = self.language if "language" in self.language.keys() else "en"
        # Create the XML root element
        page_hocr = ETElement("html", attrib={"xmlns": "http://www.w3.org/1999/xhtml", "xml:lang": str(language)})
        # Create the header / SubElements of the root element
        head = SubElement(page_hocr, "head")
        SubElement(head, "title").text = file_title
        SubElement(head, "meta", attrib={"http-equiv": "Content-Type", "content": "text/html; charset=utf-8"})
        SubElement(
            head,
            "meta",
            attrib={"name": "ocr-system", "content": f"python-doctr {doctr.__version__}"},  # type: ignore[attr-defined]
        )
        SubElement(
            head,
            "meta",
            attrib={"name": "ocr-capabilities", "content": "ocr_page ocr_carea ocr_par ocr_line ocrx_word"},
        )
        # Create the body
        body = SubElement(page_hocr, "body")
        SubElement(
            body,
            "div",
            attrib={
                "class": "ocr_page",
                "id": f"page_{p_idx + 1}",
                "title": f"image; bbox 0 0 {width} {height}; ppageno 0",
            },
        )
        # iterate over the blocks / lines / words and create the XML elements in body line by line with the attributes
        for block in self.blocks:
            if len(block.geometry) != 2:
                raise TypeError("XML export is only available for straight bounding boxes for now.")
            # Relative coordinates are scaled back to absolute pixels for the hOCR bbox
            (xmin, ymin), (xmax, ymax) = block.geometry
            block_div = SubElement(
                body,
                "div",
                attrib={
                    "class": "ocr_carea",
                    "id": f"block_{block_count}",
                    "title": f"bbox {int(round(xmin * width))} {int(round(ymin * height))} \
                    {int(round(xmax * width))} {int(round(ymax * height))}",
                },
            )
            paragraph = SubElement(
                block_div,
                "p",
                attrib={
                    "class": "ocr_par",
                    "id": f"par_{block_count}",
                    "title": f"bbox {int(round(xmin * width))} {int(round(ymin * height))} \
                    {int(round(xmax * width))} {int(round(ymax * height))}",
                },
            )
            block_count += 1
            for line in block.lines:
                (xmin, ymin), (xmax, ymax) = line.geometry
                # NOTE: baseline, x_size, x_descenders, x_ascenders is currently initalized to 0
                line_span = SubElement(
                    paragraph,
                    "span",
                    attrib={
                        "class": "ocr_line",
                        "id": f"line_{line_count}",
                        "title": f"bbox {int(round(xmin * width))} {int(round(ymin * height))} \
                        {int(round(xmax * width))} {int(round(ymax * height))}; \
                        baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0",
                    },
                )
                line_count += 1
                for word in line.words:
                    (xmin, ymin), (xmax, ymax) = word.geometry
                    conf = word.confidence
                    word_div = SubElement(
                        line_span,
                        "span",
                        attrib={
                            "class": "ocrx_word",
                            "id": f"word_{word_count}",
                            "title": f"bbox {int(round(xmin * width))} {int(round(ymin * height))} \
                            {int(round(xmax * width))} {int(round(ymax * height))}; \
                            x_wconf {int(round(conf * 100))}",
                        },
                    )
                    # set the text
                    word_div.text = word.value
                    word_count += 1

        return (ET.tostring(page_hocr, encoding="utf-8", method="xml"), ET.ElementTree(page_hocr))

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        # NOTE: the raw page image is not part of the export, so it is not restored here
        kwargs = {k: save_dict[k] for k in cls._exported_keys}
        kwargs.update({"blocks": [Block.from_dict(block_dict) for block_dict in save_dict["blocks"]]})
        return cls(**kwargs)
408
+
409
+
410
class KIEPage(Element):
    """Implements a KIE page element as a collection of predictions

    Args:
    ----
        predictions: Dictionary with list of block elements for each detection class
        page: image encoded as a numpy array in uint8
        page_idx: the index of the page in the input raw document
        dimensions: the page size in pixels in format (height, width)
        orientation: a dictionary with the value of the rotation angle in degrees and confidence of the prediction
        language: a dictionary with the language value and confidence of the prediction
    """

    _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"]
    _children_names: List[str] = ["predictions"]
    predictions: Dict[str, List[Prediction]] = {}

    def __init__(
        self,
        page: np.ndarray,
        predictions: Dict[str, List[Prediction]],
        page_idx: int,
        dimensions: Tuple[int, int],
        orientation: Optional[Dict[str, Any]] = None,
        language: Optional[Dict[str, Any]] = None,
    ) -> None:
        super().__init__(predictions=predictions)
        self.page = page
        self.page_idx = page_idx
        self.dimensions = dimensions
        # Fall back to unknown orientation/language when none is provided
        self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None)
        self.language = language if isinstance(language, dict) else dict(value=None, confidence=None)

    def render(self, prediction_break: str = "\n\n") -> str:
        """Renders the full text of the element"""
        return prediction_break.join(
            f"{class_name}: {p.render()}" for class_name, predictions in self.predictions.items() for p in predictions
        )

    def extra_repr(self) -> str:
        return f"dimensions={self.dimensions}"

    def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
        """Overlay the result on a given image

        Args:
        ----
            interactive: whether the display should be interactive
            preserve_aspect_ratio: pass True if you passed True to the predictor
            **kwargs: keyword arguments passed to the matplotlib.pyplot.show method
        """
        visualize_kie_page(
            self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio
        )
        plt.show(**kwargs)

    def synthesize(self, **kwargs) -> np.ndarray:
        """Synthesize the page from the predictions

        Args:
        ----
            **kwargs: keyword arguments passed to the matplotlib.pyplot.show method

        Returns:
        -------
            synthesized page
        """
        return synthesize_kie_page(self.export(), **kwargs)

    def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> Tuple[bytes, ET.ElementTree]:
        """Export the page as XML (hOCR-format)
        convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md

        Args:
        ----
            file_title: the title of the XML file

        Returns:
        -------
            a tuple of the XML byte string, and its ElementTree

        Raises:
        ------
            TypeError: if any prediction uses rotated (non 2-point) geometry
        """
        p_idx = self.page_idx
        prediction_count: int = 1
        height, width = self.dimensions
        # NOTE(review): self.language holds {"value", "confidence"} keys, so this membership
        # test is effectively always False and falls back to "en" — confirm intended behavior.
        language = self.language if "language" in self.language.keys() else "en"
        # Create the XML root element
        page_hocr = ETElement("html", attrib={"xmlns": "http://www.w3.org/1999/xhtml", "xml:lang": str(language)})
        # Create the header / SubElements of the root element
        head = SubElement(page_hocr, "head")
        SubElement(head, "title").text = file_title
        SubElement(head, "meta", attrib={"http-equiv": "Content-Type", "content": "text/html; charset=utf-8"})
        SubElement(
            head,
            "meta",
            attrib={"name": "ocr-system", "content": f"python-doctr {doctr.__version__}"},  # type: ignore[attr-defined]
        )
        SubElement(
            head,
            "meta",
            attrib={"name": "ocr-capabilities", "content": "ocr_page ocr_carea ocr_par ocr_line ocrx_word"},
        )
        # Create the body
        body = SubElement(page_hocr, "body")
        SubElement(
            body,
            "div",
            attrib={
                "class": "ocr_page",
                "id": f"page_{p_idx + 1}",
                "title": f"image; bbox 0 0 {width} {height}; ppageno 0",
            },
        )
        # iterate over the predictions and create the XML elements in body line by line with the attributes
        for class_name, predictions in self.predictions.items():
            for prediction in predictions:
                if len(prediction.geometry) != 2:
                    raise TypeError("XML export is only available for straight bounding boxes for now.")
                # Relative coordinates are scaled back to absolute pixels for the hOCR bbox
                (xmin, ymin), (xmax, ymax) = prediction.geometry
                prediction_div = SubElement(
                    body,
                    "div",
                    attrib={
                        "class": "ocr_carea",
                        "id": f"{class_name}_prediction_{prediction_count}",
                        "title": f"bbox {int(round(xmin * width))} {int(round(ymin * height))} \
                        {int(round(xmax * width))} {int(round(ymax * height))}",
                    },
                )
                prediction_div.text = prediction.value
                prediction_count += 1

        return ET.tostring(page_hocr, encoding="utf-8", method="xml"), ET.ElementTree(page_hocr)

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuild a KIEPage from its exported dict representation.

        Note: `export()` serializes predictions as a dict mapping each detection class to a
        list of prediction dicts. The previous implementation iterated the dict directly
        (yielding class-name strings) and produced a flat list, which broke both
        `__init__` (expects Dict[str, List[Prediction]]) and `render`.
        """
        kwargs = {k: save_dict[k] for k in cls._exported_keys}
        kwargs.update({
            "predictions": {
                class_name: [Prediction.from_dict(prediction_dict) for prediction_dict in prediction_dicts]
                for class_name, prediction_dicts in save_dict["predictions"].items()
            }
        })
        return cls(**kwargs)
549
+
550
+
551
class Document(Element):
    """A whole document, i.e. an ordered collection of pages.

    Args:
    ----
        pages: list of page elements
    """

    _children_names: List[str] = ["pages"]
    pages: List[Page] = []

    def __init__(
        self,
        pages: List[Page],
    ) -> None:
        super().__init__(pages=pages)

    def render(self, page_break: str = "\n\n\n\n") -> str:
        """Concatenate the rendered text of every page."""
        rendered_pages = [page.render() for page in self.pages]
        return page_break.join(rendered_pages)

    def show(self, **kwargs) -> None:
        """Display each page with its predictions overlaid."""
        for page in self.pages:
            page.show(**kwargs)

    def synthesize(self, **kwargs) -> List[np.ndarray]:
        """Synthesize all pages from their predictions

        Returns
        -------
            list of synthesized pages
        """
        return [page.synthesize() for page in self.pages]

    def export_as_xml(self, **kwargs) -> List[Tuple[bytes, ET.ElementTree]]:
        """Export the document as XML (hOCR-format)

        Args:
        ----
            **kwargs: additional keyword arguments passed to the Page.export_as_xml method

        Returns:
        -------
            list of tuple of (bytes, ElementTree)
        """
        return [page.export_as_xml(**kwargs) for page in self.pages]

    @classmethod
    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
        """Rebuild a Document (and its pages) from its exported dict representation."""
        restored = {k: save_dict[k] for k in cls._exported_keys}
        restored["pages"] = [Page.from_dict(page_dict) for page_dict in save_dict["pages"]]
        return cls(**restored)
604
+
605
+
606
class KIEDocument(Document):
    """Implements a KIE document element as a collection of KIE pages

    Args:
    ----
        pages: list of page elements
    """

    _children_names: List[str] = ["pages"]
    # Narrowed from List[Page]: KIE documents hold per-class prediction pages
    pages: List[KIEPage] = []  # type: ignore[assignment]

    def __init__(
        self,
        pages: List[KIEPage],
    ) -> None:
        super().__init__(pages=pages)  # type: ignore[arg-type]
doctr/io/html.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ from typing import Any
7
+
8
+ from weasyprint import HTML
9
+
10
+ __all__ = ["read_html"]
11
+
12
+
13
def read_html(url: str, **kwargs: Any) -> bytes:
    """Fetch a web page and render it into a PDF byte stream

    >>> from doctr.io import read_html
    >>> doc = read_html("https://www.yoursite.com")

    Args:
    ----
        url: URL of the target web page
        **kwargs: keyword arguments from `weasyprint.HTML`

    Returns:
    -------
        the rendered page as a PDF bytes stream
    """
    html_doc = HTML(url, **kwargs)
    return html_doc.write_pdf()
doctr/io/image/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from doctr.file_utils import is_tf_available, is_torch_available
2
+
3
+ from .base import *
4
+
5
+ if is_tf_available():
6
+ from .tensorflow import *
7
+ elif is_torch_available():
8
+ from .pytorch import *
doctr/io/image/base.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ from pathlib import Path
7
+ from typing import Optional, Tuple
8
+
9
+ import cv2
10
+ import numpy as np
11
+
12
+ from doctr.utils.common_types import AbstractFile
13
+
14
+ __all__ = ["read_img_as_numpy"]
15
+
16
+
17
def read_img_as_numpy(
    file: AbstractFile,
    output_size: Optional[Tuple[int, int]] = None,
    rgb_output: bool = True,
) -> np.ndarray:
    """Read an image file into numpy format

    >>> from doctr.io import read_img_as_numpy
    >>> page = read_img_as_numpy("path/to/your/doc.jpg")

    Args:
    ----
        file: the path to the image file
        output_size: the expected output size of each page in format H x W
        rgb_output: whether the output ndarray channel order should be RGB instead of BGR.

    Returns:
    -------
        the page decoded as numpy ndarray of shape H x W x 3
    """
    # Decode either from disk or from an in-memory byte buffer
    if isinstance(file, (str, Path)):
        if not Path(file).is_file():
            raise FileNotFoundError(f"unable to access {file}")
        img = cv2.imread(str(file), cv2.IMREAD_COLOR)
    elif isinstance(file, bytes):
        raw_buffer: np.ndarray = np.frombuffer(file, np.uint8)
        img = cv2.imdecode(raw_buffer, cv2.IMREAD_COLOR)
    else:
        raise TypeError("unsupported object type for argument 'file'")

    # OpenCV signals decoding failure by returning None rather than raising
    if img is None:
        raise ValueError("unable to read file.")
    # Optional resizing (cv2 expects W x H)
    if isinstance(output_size, tuple):
        img = cv2.resize(img, output_size[::-1], interpolation=cv2.INTER_LINEAR)
    # OpenCV decodes as BGR; switch to RGB when requested
    if rgb_output:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img
doctr/io/image/pytorch.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ from io import BytesIO
7
+ from typing import Tuple
8
+
9
+ import numpy as np
10
+ import torch
11
+ from PIL import Image
12
+ from torchvision.transforms.functional import to_tensor
13
+
14
+ from doctr.utils.common_types import AbstractPath
15
+
16
+ __all__ = ["tensor_from_pil", "read_img_as_tensor", "decode_img_as_tensor", "tensor_from_numpy", "get_img_shape"]
17
+
18
+
19
def tensor_from_pil(pil_img: Image.Image, dtype: torch.dtype = torch.float32) -> torch.Tensor:
    """Convert a PIL Image to a PyTorch tensor

    Args:
    ----
        pil_img: a PIL image
        dtype: the output tensor data type

    Returns:
    -------
        decoded image as tensor
    """
    # to_tensor already performs the HWC->CHW permutation and division by 255
    if dtype == torch.float32:
        return to_tensor(pil_img)
    # other dtypes go through the numpy conversion path
    return tensor_from_numpy(np.array(pil_img, np.uint8, copy=True), dtype)
37
+
38
+
39
def read_img_as_tensor(img_path: "AbstractPath", dtype: torch.dtype = torch.float32) -> torch.Tensor:
    """Read an image file as a PyTorch tensor

    Args:
    ----
        img_path: location of the image file
        dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.

    Returns:
    -------
        decoded image as a tensor

    Raises:
    ------
        ValueError: if dtype is not one of torch.uint8, torch.float16, torch.float32
    """
    if dtype not in (torch.uint8, torch.float16, torch.float32):
        # fixed typo in the error message ("insupported" -> "unsupported")
        raise ValueError("unsupported value for dtype")

    # Normalize to 3-channel RGB regardless of the source format
    pil_img = Image.open(img_path, mode="r").convert("RGB")

    return tensor_from_pil(pil_img, dtype)
57
+
58
+
59
def decode_img_as_tensor(img_content: bytes, dtype: torch.dtype = torch.float32) -> torch.Tensor:
    """Read a byte stream as a PyTorch tensor

    Args:
    ----
        img_content: bytes of a decoded image
        dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.

    Returns:
    -------
        decoded image as a tensor

    Raises:
    ------
        ValueError: if dtype is not one of torch.uint8, torch.float16, torch.float32
    """
    if dtype not in (torch.uint8, torch.float16, torch.float32):
        # fixed typo in the error message ("insupported" -> "unsupported")
        raise ValueError("unsupported value for dtype")

    # Normalize to 3-channel RGB regardless of the source format
    pil_img = Image.open(BytesIO(img_content), mode="r").convert("RGB")

    return tensor_from_pil(pil_img, dtype)
77
+
78
+
79
def tensor_from_numpy(npy_img: np.ndarray, dtype: torch.dtype = torch.float32) -> torch.Tensor:
    """Convert a numpy image into a PyTorch tensor

    Args:
    ----
        npy_img: image encoded as a numpy array of shape (H, W, C) in np.uint8
        dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.

    Returns:
    -------
        same image as a tensor of shape (C, H, W)

    Raises:
    ------
        ValueError: if dtype is not one of torch.uint8, torch.float16, torch.float32
    """
    if dtype not in (torch.uint8, torch.float16, torch.float32):
        # fixed typo in the error message ("insupported" -> "unsupported")
        raise ValueError("unsupported value for dtype")

    if dtype == torch.float32:
        # to_tensor handles the HWC -> CHW permutation and the division by 255
        img = to_tensor(npy_img)
    else:
        img = torch.from_numpy(npy_img)
        # put it from HWC to CHW format
        img = img.permute((2, 0, 1)).contiguous()
        if dtype == torch.float16:
            # Switch to FP16 and rescale to [0, 1]
            img = img.to(dtype=torch.float16).div(255)

    return img
105
+
106
+
107
def get_img_shape(img: torch.Tensor) -> Tuple[int, int]:
    """Get the (height, width) of a CHW image tensor"""
    # Last two dims are spatial for CHW (and batched NCHW) layouts
    return img.shape[-2:]  # type: ignore[return-value]
doctr/io/image/tensorflow.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ from typing import Tuple
7
+
8
+ import numpy as np
9
+ import tensorflow as tf
10
+ from PIL import Image
11
+ from tensorflow.keras.utils import img_to_array
12
+
13
+ from doctr.utils.common_types import AbstractPath
14
+
15
+ __all__ = ["tensor_from_pil", "read_img_as_tensor", "decode_img_as_tensor", "tensor_from_numpy", "get_img_shape"]
16
+
17
+
18
def tensor_from_pil(pil_img: Image.Image, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
    """Convert a PIL Image to a TensorFlow tensor

    Args:
    ----
        pil_img: a PIL image
        dtype: the output tensor data type

    Returns:
    -------
        decoded image as tensor
    """
    # Delegate dtype validation and conversion to the numpy path
    as_array = img_to_array(pil_img)
    return tensor_from_numpy(as_array, dtype)
33
+
34
+
35
def read_img_as_tensor(img_path: "AbstractPath", dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
    """Read an image file as a TensorFlow tensor

    Args:
    ----
        img_path: location of the image file
        dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.

    Returns:
    -------
        decoded image as a tensor

    Raises:
    ------
        ValueError: if dtype is not one of tf.uint8, tf.float16, tf.float32
    """
    if dtype not in (tf.uint8, tf.float16, tf.float32):
        # fixed typo in the error message ("insupported" -> "unsupported")
        raise ValueError("unsupported value for dtype")

    img = tf.io.read_file(img_path)
    img = tf.image.decode_jpeg(img, channels=3)

    if dtype != tf.uint8:
        # convert_image_dtype rescales uint8 values into [0, 1]
        img = tf.image.convert_image_dtype(img, dtype=dtype)
        img = tf.clip_by_value(img, 0, 1)

    return img
58
+
59
+
60
def decode_img_as_tensor(img_content: bytes, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
    """Read a byte stream as a TensorFlow tensor

    Args:
    ----
        img_content: bytes of a decoded image
        dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.

    Returns:
    -------
        decoded image as a tensor

    Raises:
    ------
        ValueError: if dtype is not one of tf.uint8, tf.float16, tf.float32
    """
    if dtype not in (tf.uint8, tf.float16, tf.float32):
        # fixed typo in the error message ("insupported" -> "unsupported")
        raise ValueError("unsupported value for dtype")

    img = tf.io.decode_image(img_content, channels=3)

    if dtype != tf.uint8:
        # convert_image_dtype rescales uint8 values into [0, 1]
        img = tf.image.convert_image_dtype(img, dtype=dtype)
        img = tf.clip_by_value(img, 0, 1)

    return img
82
+
83
+
84
def tensor_from_numpy(npy_img: np.ndarray, dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor:
    """Convert a numpy image into a TensorFlow tensor

    Args:
    ----
        npy_img: image encoded as a numpy array of shape (H, W, C) in np.uint8
        dtype: the desired data type of the output tensor. If it is float-related, values will be divided by 255.

    Returns:
    -------
        same image as a tensor of shape (H, W, C)

    Raises:
    ------
        ValueError: if dtype is not one of tf.uint8, tf.float16, tf.float32
    """
    if dtype not in (tf.uint8, tf.float16, tf.float32):
        # fixed typo in the error message ("insupported" -> "unsupported")
        raise ValueError("unsupported value for dtype")

    if dtype == tf.uint8:
        img = tf.convert_to_tensor(npy_img, dtype=dtype)
    else:
        # convert_image_dtype rescales uint8 values into [0, 1]
        img = tf.image.convert_image_dtype(npy_img, dtype=dtype)
        img = tf.clip_by_value(img, 0, 1)

    return img
106
+
107
+
108
def get_img_shape(img: tf.Tensor) -> Tuple[int, int]:
    """Get the (height, width) of an HWC image tensor"""
    # First two dims are spatial for the HWC layout used by the TF backend
    return img.shape[:2]
doctr/io/pdf.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ from typing import Any, List, Optional
7
+
8
+ import numpy as np
9
+ import pypdfium2 as pdfium
10
+
11
+ from doctr.utils.common_types import AbstractFile
12
+
13
+ __all__ = ["read_pdf"]
14
+
15
+
16
def read_pdf(
    file: AbstractFile,
    scale: float = 2,
    rgb_mode: bool = True,
    password: Optional[str] = None,
    **kwargs: Any,
) -> List[np.ndarray]:
    """Read a PDF file and convert it into an image in numpy format

    >>> from doctr.io import read_pdf
    >>> doc = read_pdf("path/to/your/doc.pdf")

    Args:
    ----
        file: the path to the PDF file
        scale: rendering scale (1 corresponds to 72dpi)
        rgb_mode: if True, the output will be RGB, otherwise BGR
        password: a password to unlock the document, if encrypted
        **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`

    Returns:
    -------
        the list of pages decoded as numpy ndarray of shape H x W x C
    """
    # Rasterise pages to numpy ndarrays with pypdfium2
    document = pdfium.PdfDocument(file, password=password, autoclose=True)
    rendered_pages: List[np.ndarray] = []
    for page in document:
        bitmap = page.render(scale=scale, rev_byteorder=rgb_mode, **kwargs)
        rendered_pages.append(bitmap.to_numpy())
    return rendered_pages
doctr/io/reader.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2021-2024, Mindee.
2
+
3
+ # This program is licensed under the Apache License 2.0.
4
+ # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
5
+
6
+ from pathlib import Path
7
+ from typing import List, Sequence, Union
8
+
9
+ import numpy as np
10
+
11
+ from doctr.utils.common_types import AbstractFile
12
+
13
+ from .html import read_html
14
+ from .image import read_img_as_numpy
15
+ from .pdf import read_pdf
16
+
17
+ __all__ = ["DocumentFile"]
18
+
19
+
20
class DocumentFile:
    """Read a document from multiple extensions"""

    @classmethod
    def from_pdf(cls, file: AbstractFile, **kwargs) -> List[np.ndarray]:
        """Read a PDF file

        >>> from doctr.io import DocumentFile
        >>> doc = DocumentFile.from_pdf("path/to/your/doc.pdf")

        Args:
        ----
            file: the path to the PDF file or a binary stream
            **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`

        Returns:
        -------
            the list of pages decoded as numpy ndarray of shape H x W x 3
        """
        # Delegate rasterisation to the pdf module
        return read_pdf(file, **kwargs)

    @classmethod
    def from_url(cls, url: str, **kwargs) -> List[np.ndarray]:
        """Interpret a web page as a PDF document

        >>> from doctr.io import DocumentFile
        >>> doc = DocumentFile.from_url("https://www.yoursite.com")

        Args:
        ----
            url: the URL of the target web page
            **kwargs: additional parameters to :meth:`pypdfium2.PdfPage.render`

        Returns:
        -------
            the list of pages decoded as numpy ndarray of shape H x W x 3
        """
        # Render the page to a PDF byte stream, then reuse the PDF pipeline
        return cls.from_pdf(read_html(url), **kwargs)

    @classmethod
    def from_images(cls, files: Union[Sequence[AbstractFile], AbstractFile], **kwargs) -> List[np.ndarray]:
        """Read an image file (or a collection of image files) and convert it into an image in numpy format

        >>> from doctr.io import DocumentFile
        >>> pages = DocumentFile.from_images(["path/to/your/page1.png", "path/to/your/page2.png"])

        Args:
        ----
            files: the path to the image file or a binary stream, or a collection of those
            **kwargs: additional parameters to :meth:`doctr.io.image.read_img_as_numpy`

        Returns:
        -------
            the list of pages decoded as numpy ndarray of shape H x W x 3
        """
        # A single path / byte stream is promoted to a one-element collection
        if isinstance(files, (str, Path, bytes)):
            files = [files]

        return [read_img_as_numpy(single_file, **kwargs) for single_file in files]