Spaces:

legacies
/

doctr

Runtime error

App Files Files Community

legacies committed on May 29, 2024

Commit

7d98805

1 Parent(s): df8e96c

readme files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +2 -10
.gitignore +0 -140
.pre-commit-config.yaml +0 -23
CODE_OF_CONDUCT.md +0 -128
CONTRIBUTING.md +0 -92
Dockerfile +0 -75
LICENSE +0 -201
Makefile +0 -33
README.md +34 -5
README1.md +0 -384
main.py → app.py +0 -4
backend/__pycache__/pytorch.cpython-312.pyc +0 -0
backend/pytorch.py +0 -5
doctr/__init__.py +0 -3
doctr/datasets/__init__.py +0 -26
doctr/datasets/cord.py +0 -121
doctr/datasets/datasets/__init__.py +0 -6
doctr/datasets/datasets/base.py +0 -132
doctr/datasets/datasets/pytorch.py +0 -59
doctr/datasets/datasets/tensorflow.py +0 -59
doctr/datasets/detection.py +0 -98
doctr/datasets/doc_artefacts.py +0 -82
doctr/datasets/funsd.py +0 -112
doctr/datasets/generator/__init__.py +0 -6
doctr/datasets/generator/base.py +0 -155
doctr/datasets/generator/pytorch.py +0 -54
doctr/datasets/generator/tensorflow.py +0 -60
doctr/datasets/ic03.py +0 -126
doctr/datasets/ic13.py +0 -99
doctr/datasets/iiit5k.py +0 -103
doctr/datasets/iiithws.py +0 -75
doctr/datasets/imgur5k.py +0 -147
doctr/datasets/loader.py +0 -102
doctr/datasets/mjsynth.py +0 -106
doctr/datasets/ocr.py +0 -71
doctr/datasets/orientation.py +0 -40
doctr/datasets/recognition.py +0 -56
doctr/datasets/sroie.py +0 -103
doctr/datasets/svhn.py +0 -131
doctr/datasets/svt.py +0 -117
doctr/datasets/synthtext.py +0 -128
doctr/datasets/utils.py +0 -216
doctr/datasets/vocabs.py +0 -71
doctr/datasets/wildreceipt.py +0 -111
doctr/file_utils.py +0 -92
doctr/io/__init__.py +0 -5
doctr/io/elements.py +0 -621
doctr/io/html.py +0 -28
doctr/io/image/__init__.py +0 -8
doctr/io/image/base.py +0 -56

.gitattributes CHANGED Viewed

@@ -1,35 +1,27 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
+*.bin.* filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text
 *.ftz filter=lfs diff=lfs merge=lfs -text
 *.gz filter=lfs diff=lfs merge=lfs -text
 *.h5 filter=lfs diff=lfs merge=lfs -text
 *.joblib filter=lfs diff=lfs merge=lfs -text
 *.lfs.* filter=lfs diff=lfs merge=lfs -text
 *.model filter=lfs diff=lfs merge=lfs -text
 *.msgpack filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore DELETED Viewed

@@ -1,140 +0,0 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-# C extensions
-*.so
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-pip-wheel-metadata/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-# Translations
-*.mo
-*.pot
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-# Flask stuff:
-instance/
-.webassets-cache
-# Scrapy stuff:
-.scrapy
-# Sphinx documentation
-docs/_build/
-# PyBuilder
-target/
-# Jupyter Notebook
-.ipynb_checkpoints
-# IPython
-profile_default/
-ipython_config.py
-# pyenv
-.python-version
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow
-__pypackages__/
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-# SageMath parsed files
-*.sage.py
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-# Spyder project settings
-.spyderproject
-.spyproject
-# Rope project settings
-.ropeproject
-# mkdocs documentation
-/site
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-# Pyre type checker
-.pyre/
-# Temp files
-doctr/version.py
-logs/
-wandb/
-.idea/
-# Checkpoints
-*.pt
-*.pb
-*.index

.pre-commit-config.yaml DELETED Viewed

@@ -1,23 +0,0 @@
-repos:
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
-    hooks:
-      - id: check-ast
-      - id: check-yaml
-        exclude: .conda
-      - id: check-toml
-      - id: check-json
-      - id: check-added-large-files
-        exclude: docs/images/
-      - id: end-of-file-fixer
-      - id: trailing-whitespace
-      - id: debug-statements
-      - id: check-merge-conflict
-      - id: no-commit-to-branch
-        args: ['--branch', 'main']
-  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.2
-    hooks:
-      - id: ruff
-        args: [ --fix ]
-      - id: ruff-format

CODE_OF_CONDUCT.md DELETED Viewed

@@ -1,128 +0,0 @@
-# Contributor Covenant Code of Conduct
-## Our Pledge
-We as members, contributors, and leaders pledge to make participation in our
-community a harassment-free experience for everyone, regardless of age, body
-size, visible or invisible disability, ethnicity, sex characteristics, gender
-identity and expression, level of experience, education, socio-economic status,
-nationality, personal appearance, race, religion, or sexual identity
-and orientation.
-We pledge to act and interact in ways that contribute to an open, welcoming,
-diverse, inclusive, and healthy community.
-## Our Standards
-Examples of behavior that contributes to a positive environment for our
-community include:
-* Demonstrating empathy and kindness toward other people
-* Being respectful of differing opinions, viewpoints, and experiences
-* Giving and gracefully accepting constructive feedback
-* Accepting responsibility and apologizing to those affected by our mistakes,
-  and learning from the experience
-* Focusing on what is best not just for us as individuals, but for the
-  overall community
-Examples of unacceptable behavior include:
-* The use of sexualized language or imagery, and sexual attention or
-  advances of any kind
-* Trolling, insulting or derogatory comments, and personal or political attacks
-* Public or private harassment
-* Publishing others' private information, such as a physical or email
-  address, without their explicit permission
-* Other conduct which could reasonably be considered inappropriate in a
-  professional setting
-## Enforcement Responsibilities
-Community leaders are responsible for clarifying and enforcing our standards of
-acceptable behavior and will take appropriate and fair corrective action in
-response to any behavior that they deem inappropriate, threatening, offensive,
-or harmful.
-Community leaders have the right and responsibility to remove, edit, or reject
-comments, commits, code, wiki edits, issues, and other contributions that are
-not aligned to this Code of Conduct, and will communicate reasons for moderation
-decisions when appropriate.
-## Scope
-This Code of Conduct applies within all community spaces, and also applies when
-an individual is officially representing the community in public spaces.
-Examples of representing our community include using an official e-mail address,
-posting via an official social media account, or acting as an appointed
-representative at an online or offline event.
-## Enforcement
-Instances of abusive, harassing, or otherwise unacceptable behavior may be
-reported to the community leaders responsible for enforcement at
-contact@mindee.com.
-All complaints will be reviewed and investigated promptly and fairly.
-All community leaders are obligated to respect the privacy and security of the
-reporter of any incident.
-## Enforcement Guidelines
-Community leaders will follow these Community Impact Guidelines in determining
-the consequences for any action they deem in violation of this Code of Conduct:
-### 1. Correction
-**Community Impact**: Use of inappropriate language or other behavior deemed
-unprofessional or unwelcome in the community.
-**Consequence**: A private, written warning from community leaders, providing
-clarity around the nature of the violation and an explanation of why the
-behavior was inappropriate. A public apology may be requested.
-### 2. Warning
-**Community Impact**: A violation through a single incident or series
-of actions.
-**Consequence**: A warning with consequences for continued behavior. No
-interaction with the people involved, including unsolicited interaction with
-those enforcing the Code of Conduct, for a specified period of time. This
-includes avoiding interactions in community spaces as well as external channels
-like social media. Violating these terms may lead to a temporary or
-permanent ban.
-### 3. Temporary Ban
-**Community Impact**: A serious violation of community standards, including
-sustained inappropriate behavior.
-**Consequence**: A temporary ban from any sort of interaction or public
-communication with the community for a specified period of time. No public or
-private interaction with the people involved, including unsolicited interaction
-with those enforcing the Code of Conduct, is allowed during this period.
-Violating these terms may lead to a permanent ban.
-### 4. Permanent Ban
-**Community Impact**: Demonstrating a pattern of violation of community
-standards, including sustained inappropriate behavior,  harassment of an
-individual, or aggression toward or disparagement of classes of individuals.
-**Consequence**: A permanent ban from any sort of public interaction within
-the community.
-## Attribution
-This Code of Conduct is adapted from the [Contributor Covenant][homepage],
-version 2.0, available at
-https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
-Community Impact Guidelines were inspired by [Mozilla's code of conduct
-enforcement ladder](https://github.com/mozilla/diversity).
-[homepage]: https://www.contributor-covenant.org
-For answers to common questions about this code of conduct, see the FAQ at
-https://www.contributor-covenant.org/faq. Translations are available at
-https://www.contributor-covenant.org/translations.

CONTRIBUTING.md DELETED Viewed

@@ -1,92 +0,0 @@
-# Contributing to docTR
-Everything you need to know to contribute efficiently to the project.
-## Codebase structure
-- [doctr](https://github.com/mindee/doctr/blob/main/doctr) - The package codebase
-- [tests](https://github.com/mindee/doctr/blob/main/tests) - Python unit tests
-- [docs](https://github.com/mindee/doctr/blob/main/docs) - Library documentation building
-- [scripts](https://github.com/mindee/doctr/blob/main/scripts) - Example scripts
-- [references](https://github.com/mindee/doctr/blob/main/references) - Reference training scripts
-- [demo](https://github.com/mindee/doctr/blob/main/demo) - Small demo app to showcase docTR capabilities
-- [api](https://github.com/mindee/doctr/blob/main/api) - A minimal template to deploy a REST API with docTR
-## Continuous Integration
-This project uses the following integrations to ensure proper codebase maintenance:
-- [GitHub Workflow](https://help.github.com/en/actions/configuring-and-managing-workflows/configuring-a-workflow) - runs jobs for package build and coverage
-- [Codecov](https://codecov.io/) - reports back coverage results
-As a contributor, you will only have to ensure coverage of your code by adding appropriate unit testing of your code.
-## Feedback
-### Feature requests & bug report
-Whether you encountered a problem, or you have a feature suggestion, your input has value and can be used by contributors to reference it in their developments. For this purpose, we advise you to use Github [issues](https://github.com/mindee/doctr/issues).
-First, check whether the topic wasn't already covered in an open / closed issue. If not, feel free to open a new one! When doing so, use issue templates whenever possible and provide enough information for other contributors to jump in.
-### Questions
-If you are wondering how to do something with docTR, or a more general question, you should consider checking out Github [discussions](https://github.com/mindee/doctr/discussions). See it as a Q&A forum, or the docTR-specific StackOverflow!
-## Developing docTR
-### Developer mode installation
-Install all additional dependencies with the following command:
-```shell
-python -m pip install --upgrade pip
-pip install -e .[dev]
-pre-commit install
-```
-### Commits
-- **Code**: make sure to provide docstrings for your Python code. In doing so, please follow the [Google-style](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) convention, which eases the process of generating documentation later.
-- **Commit message**: please follow [Udacity guide](http://udacity.github.io/git-styleguide/)
-### Unit tests
-In order to run the same unit tests as the CI workflows, you can run unittests locally:
-```shell
-make test
-```
-### Code quality
-To run all quality checks together
-```shell
-make quality
-```
-#### Code style verification
-To run all style checks together
-```shell
-make style
-```
-### Modifying the documentation
-The current documentation is built using `sphinx` thanks to our CI.
-You can build the documentation locally:
-```shell
-make docs-single-version
-```
-Please note that files that have not been modified will not be rebuilt. If you want to force a complete rebuild, you can delete the `_build` directory. Additionally, you may need to clear your web browser's cache to see the modifications.
-You can now open your local version of the documentation located at `docs/_build/index.html` in your browser.
-## Let's connect
-Should you wish to connect somewhere else than on GitHub, feel free to join us on [Slack](https://join.slack.com/t/mindee-community/shared_invite/zt-uzgmljfl-MotFVfH~IdEZxjp~0zldww), where you will find a `#doctr` channel!

Dockerfile DELETED Viewed

@@ -1,75 +0,0 @@
-FROM ubuntu:22.04
-ENV DEBIAN_FRONTEND=noninteractive
-ENV LANG=C.UTF-8
-ENV PYTHONUNBUFFERED=1
-ENV PYTHONDONTWRITEBYTECODE=1
-ARG SYSTEM=gpu
-# Enroll NVIDIA GPG public key and install CUDA
-RUN if [ "$SYSTEM" = "gpu" ]; then \
-    apt-get update && \
-    apt-get install -y gnupg ca-certificates wget && \
-    # - Install Nvidia repo keys
-    # - See: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#network-repo-installation-for-ubuntu
-    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb && \
-    dpkg -i cuda-keyring_1.1-1_all.deb && \
-    apt-get update && apt-get install -y --no-install-recommends \
-    cuda-command-line-tools-11-8 \
-    cuda-cudart-dev-11-8 \
-    cuda-nvcc-11-8 \
-    cuda-cupti-11-8 \
-    cuda-nvprune-11-8 \
-    cuda-libraries-11-8 \
-    cuda-nvrtc-11-8 \
-    libcufft-11-8 \
-    libcurand-11-8 \
-    libcusolver-11-8 \
-    libcusparse-11-8 \
-    libcublas-11-8 \
-    # - CuDNN: https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#ubuntu-network-installation
-    libcudnn8=8.6.0.163-1+cuda11.8 \
-    libnvinfer-plugin8=8.6.1.6-1+cuda11.8 \
-    libnvinfer8=8.6.1.6-1+cuda11.8; \
-fi
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    # - Other packages
-    build-essential \
-    pkg-config \
-    curl \
-    wget \
-    software-properties-common \
-    unzip \
-    git \
-    # - Packages to build Python
-    tar make gcc zlib1g-dev libffi-dev libssl-dev liblzma-dev libbz2-dev libsqlite3-dev \
-    # - Packages for docTR
-    libgl1-mesa-dev libsm6 libxext6 libxrender-dev libpangocairo-1.0-0 \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/* \
-fi
-# Install Python
-ARG PYTHON_VERSION=3.10.13
-RUN wget http://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz && \
-    tar -zxf Python-$PYTHON_VERSION.tgz && \
-    cd Python-$PYTHON_VERSION && \
-    mkdir /opt/python/ && \
-    ./configure --prefix=/opt/python && \
-    make && \
-    make install && \
-    cd .. && \
-    rm Python-$PYTHON_VERSION.tgz && \
-    rm -r Python-$PYTHON_VERSION
-ENV PATH=/opt/python/bin:$PATH
-# Install docTR
-ARG FRAMEWORK=tf
-ARG DOCTR_REPO='mindee/doctr'
-ARG DOCTR_VERSION=main
-RUN pip3 install -U pip setuptools wheel && \
-    pip3 install "python-doctr[$FRAMEWORK]@git+https://github.com/$DOCTR_REPO.git@$DOCTR_VERSION"

LICENSE DELETED Viewed

@@ -1,201 +0,0 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-   1. Definitions.
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-   END OF TERMS AND CONDITIONS
-   APPENDIX: How to apply the Apache License to your work.
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-   Copyright 2022 Mindee
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-       http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.

Makefile DELETED Viewed

@@ -1,33 +0,0 @@
-.PHONY: quality style test test-common test-tf test-torch docs-single-version docs
-# this target runs checks on all files
-quality:
-	ruff check .
-	mypy doctr/
-# this target runs checks on all files and potentially modifies some of them
-style:
-	ruff check --fix .
-	ruff format .
-# Run tests for the library
-test:
-	coverage run -m pytest tests/common/
-	USE_TF='1' coverage run -m pytest tests/tensorflow/
-	USE_TORCH='1' coverage run -m pytest tests/pytorch/
-test-common:
-	coverage run -m pytest tests/common/
-test-tf:
-	USE_TF='1' coverage run -m pytest tests/tensorflow/
-test-torch:
-	USE_TORCH='1' coverage run -m pytest tests/pytorch/
-# Check that docs can build
-docs-single-version:
-	sphinx-build docs/source docs/_build -a
-# Check that docs can build
-docs:
-	cd docs && bash build.sh

README.md CHANGED Viewed

@@ -1,10 +1,39 @@
 ---
-title: My OCR Project
-emoji: 🚀
-colorFrom: red
-colorTo: yellow
 sdk: streamlit
-sdk_version: "1.24.0"
 app_file: app.py
 pinned: false
 ---

 ---
+title: docTR
+emoji: 📑
+colorFrom: purple
+colorTo: pink
 sdk: streamlit
+sdk_version: 1.30.0
 app_file: app.py
 pinned: false
+license: apache-2.0
 ---
+# Configuration
+`title`: _string_
+Display title for the Space
+`emoji`: _string_
+Space emoji (emoji-only character allowed)
+`colorFrom`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+`colorTo`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+`sdk`: _string_
+Can be either `gradio` or `streamlit`
+`sdk_version` : _string_
+Only applicable for `streamlit` SDK.
+See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
+`app_file`: _string_
+Path to your main application file (which contains either `gradio` or `streamlit` Python code).
+Path is relative to the root of the repository.
+`pinned`: _boolean_
+Whether the Space stays on top of your list.

README1.md DELETED Viewed

@@ -1,384 +0,0 @@
-<p align="center">
-  <img src="https://github.com/mindee/doctr/raw/main/docs/images/Logo_doctr.gif" width="40%">
-</p>
-[![Slack Icon](https://img.shields.io/badge/Slack-Community-4A154B?style=flat-square&logo=slack&logoColor=white)](https://slack.mindee.com) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE) ![Build Status](https://github.com/mindee/doctr/workflows/builds/badge.svg) [![Docker Images](https://img.shields.io/badge/Docker-4287f5?style=flat&logo=docker&logoColor=white)](https://github.com/mindee/doctr/pkgs/container/doctr) [![codecov](https://codecov.io/gh/mindee/doctr/branch/main/graph/badge.svg?token=577MO567NM)](https://codecov.io/gh/mindee/doctr) [![CodeFactor](https://www.codefactor.io/repository/github/mindee/doctr/badge?s=bae07db86bb079ce9d6542315b8c6e70fa708a7e)](https://www.codefactor.io/repository/github/mindee/doctr) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/340a76749b634586a498e1c0ab998f08)](https://app.codacy.com/gh/mindee/doctr?utm_source=github.com&utm_medium=referral&utm_content=mindee/doctr&utm_campaign=Badge_Grade) [![Doc Status](https://github.com/mindee/doctr/workflows/doc-status/badge.svg)](https://mindee.github.io/doctr) [![Pypi](https://img.shields.io/badge/pypi-v0.8.1-blue.svg)](https://pypi.org/project/python-doctr/) [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/mindee/doctr) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mindee/notebooks/blob/main/doctr/quicktour.ipynb)
-**Optical Character Recognition made seamless & accessible to anyone, powered by TensorFlow 2 & PyTorch**
-What you can expect from this repository:
-- efficient ways to parse textual information (localize and identify each word) from your documents
-- guidance on how to integrate this in your current architecture
-![OCR_example](https://github.com/mindee/doctr/raw/main/docs/images/ocr.png)
-## Quick Tour
-### Getting your pretrained model
-End-to-End OCR is achieved in docTR using a two-stage approach: text detection (localizing words), then text recognition (identify all characters in the word).
-As such, you can select the architecture used for [text detection](https://mindee.github.io/doctr/latest/modules/models.html#doctr-models-detection), and the one for [text recognition](https://mindee.github.io/doctr/latest/modules/models.html#doctr-models-recognition) from the list of available implementations.
-```python
-from doctr.models import ocr_predictor
-model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
-```
-### Reading files
-Documents can be interpreted from PDF or images:
-```python
-from doctr.io import DocumentFile
-# PDF
-pdf_doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
-# Image
-single_img_doc = DocumentFile.from_images("path/to/your/img.jpg")
-# Webpage
-webpage_doc = DocumentFile.from_url("https://www.yoursite.com")
-# Multiple page images
-multi_img_doc = DocumentFile.from_images(["path/to/page1.jpg", "path/to/page2.jpg"])
-```
-### Putting it together
-Let's use the default pretrained model for an example:
-```python
-from doctr.io import DocumentFile
-from doctr.models import ocr_predictor
-model = ocr_predictor(pretrained=True)
-# PDF
-doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
-# Analyze
-result = model(doc)
-```
-### Dealing with rotated documents
-Should you use docTR on documents that include rotated pages, or pages with multiple box orientations,
-you have multiple options to handle it:
-- If you only use straight document pages with straight words (horizontal, same reading direction),
-consider passing `assume_straight_pages=True` to the ocr_predictor. It will directly fit straight boxes
-on your page and return straight boxes, which makes it the fastest option.
-- If you want the predictor to output straight boxes (no matter the orientation of your pages, the final localizations
-will be converted to straight boxes), you need to pass `export_as_straight_boxes=True` in the predictor. Otherwise, if `assume_straight_pages=False`, it will return rotated bounding boxes (potentially with an angle of 0°).
-If both options are set to False, the predictor will always fit and return rotated boxes.
-To interpret your model's predictions, you can visualize them interactively as follows:
-```python
-result.show()
-```
-![Visualization sample](https://github.com/mindee/doctr/raw/main/docs/images/doctr_example_script.gif)
-Or even rebuild the original document from its predictions:
-```python
-import matplotlib.pyplot as plt
-synthetic_pages = result.synthesize()
-plt.imshow(synthetic_pages[0]); plt.axis('off'); plt.show()
-```
-![Synthesis sample](https://github.com/mindee/doctr/raw/main/docs/images/synthesized_sample.png)
-The `ocr_predictor` returns a `Document` object with a nested structure (with `Page`, `Block`, `Line`, `Word`, `Artefact`).
-To get a better understanding of our document model, check our [documentation](https://mindee.github.io/doctr/modules/io.html#document-structure):
-You can also export them as a nested dict, more appropriate for JSON format:
-```python
-json_output = result.export()
-```
-### Use the KIE predictor
-The KIE predictor is a more flexible predictor compared to OCR as your detection model can detect multiple classes in a document. For example, you can have a detection model to detect just dates and addresses in a document.
-The KIE predictor makes it possible to use a detector with multiple classes together with a recognition model, and to have the whole pipeline already set up for you.
-```python
-from doctr.io import DocumentFile
-from doctr.models import kie_predictor
-# Model
-model = kie_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
-# PDF
-doc = DocumentFile.from_pdf("path/to/your/doc.pdf")
-# Analyze
-result = model(doc)
-predictions = result.pages[0].predictions
-for class_name in predictions.keys():
-    list_predictions = predictions[class_name]
-    for prediction in list_predictions:
-        print(f"Prediction for {class_name}: {prediction}")
-```
-The KIE predictor results per page are in a dictionary format, with each key representing a class name and its value being the predictions for that class.
-### If you are looking for support from the Mindee team
-[![Bad OCR test detection image asking the developer if they need help](https://github.com/mindee/doctr/raw/main/docs/images/doctr-need-help.png)](https://mindee.com/product/doctr)
-## Installation
-### Prerequisites
-Python 3.9 (or higher) and [pip](https://pip.pypa.io/en/stable/) are required to install docTR.
-Since we use [weasyprint](https://weasyprint.org/), you will need extra dependencies if you are not running Linux.
-For MacOS users, you can install them as follows:
-```shell
-brew install cairo pango gdk-pixbuf libffi
-```
-For Windows users, those dependencies are included in GTK. You can find the latest installer over [here](https://github.com/tschoonj/GTK-for-Windows-Runtime-Environment-Installer/releases).
-### Latest release
-You can then install the latest release of the package using [pypi](https://pypi.org/project/python-doctr/) as follows:
-```shell
-pip install python-doctr
-```
-> :warning: Please note that the basic installation is not standalone, as it does not provide a deep learning framework, which is required for the package to run.
-We try to keep framework-specific dependencies to a minimum. You can install framework-specific builds as follows:
-```shell
-# for TensorFlow
-pip install "python-doctr[tf]"
-# for PyTorch
-pip install "python-doctr[torch]"
-```
-For MacBooks with M1 chip, you will need some additional packages or specific versions:
-- TensorFlow 2: [metal plugin](https://developer.apple.com/metal/tensorflow-plugin/)
-- PyTorch: [version >= 1.12.0](https://pytorch.org/get-started/locally/#start-locally)
-### Developer mode
-Alternatively, you can install it from source, which will require you to install [Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git).
-First clone the project repository:
-```shell
-git clone https://github.com/mindee/doctr.git
-pip install -e doctr/.
-```
-Again, if you prefer to avoid the risk of missing dependencies, you can install the TensorFlow or the PyTorch build:
-```shell
-# for TensorFlow
-pip install -e doctr/.[tf]
-# for PyTorch
-pip install -e doctr/.[torch]
-```
-## Models architectures
-Credits where it's due: this repository is implementing, among others, architectures from published research papers.
-### Text Detection
-- DBNet: [Real-time Scene Text Detection with Differentiable Binarization](https://arxiv.org/pdf/1911.08947.pdf).
-- LinkNet: [LinkNet: Exploiting Encoder Representations for Efficient Semantic Segmentation](https://arxiv.org/pdf/1707.03718.pdf)
-- FAST: [FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation](https://arxiv.org/pdf/2111.02394.pdf)
-### Text Recognition
-- CRNN: [An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition](https://arxiv.org/pdf/1507.05717.pdf).
-- SAR: [Show, Attend and Read:A Simple and Strong Baseline for Irregular Text Recognition](https://arxiv.org/pdf/1811.00751.pdf).
-- MASTER: [MASTER: Multi-Aspect Non-local Network for Scene Text Recognition](https://arxiv.org/pdf/1910.02562.pdf).
-- ViTSTR: [Vision Transformer for Fast and Efficient Scene Text Recognition](https://arxiv.org/pdf/2105.08582.pdf).
-- PARSeq: [Scene Text Recognition with Permuted Autoregressive Sequence Models](https://arxiv.org/pdf/2207.06966).
-## More goodies
-### Documentation
-The full package documentation is available [here](https://mindee.github.io/doctr/) for detailed specifications.
-### Demo app
-A minimal demo app is provided for you to play with our end-to-end OCR models!
-![Demo app](https://github.com/mindee/doctr/raw/main/docs/images/demo_update.png)
-#### Live demo
-Courtesy of :hugs: [Hugging Face](https://huggingface.co/) :hugs:, docTR now has a fully deployed version available on [Spaces](https://huggingface.co/spaces)!
-Check it out [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/mindee/doctr)
-#### Running it locally
-If you prefer to use it locally, there is an extra dependency ([Streamlit](https://streamlit.io/)) that is required.
-##### Tensorflow version
-```shell
-pip install -r demo/tf-requirements.txt
-```
-Then run your app in your default browser with:
-```shell
-USE_TF=1 streamlit run demo/app.py
-```
-##### PyTorch version
-```shell
-pip install -r demo/pt-requirements.txt
-```
-Then run your app in your default browser with:
-```shell
-USE_TORCH=1 streamlit run demo/app.py
-```
-#### TensorFlow.js
-Instead of having your demo actually running Python, you would prefer to run everything in your web browser?
-Check out our [TensorFlow.js demo](https://github.com/mindee/doctr-tfjs-demo) to get started!
-![TFJS demo](https://github.com/mindee/doctr/raw/main/docs/images/demo_illustration_mini.png)
-### Docker container
-[We offer Docker container support for easy testing and deployment](https://github.com/mindee/doctr/pkgs/container/doctr).
-#### Using GPU with docTR Docker Images
-The docTR Docker images are GPU-ready and based on CUDA `11.8`.
-However, to use GPU support with these Docker images, please ensure that Docker is configured to use your GPU.
-To verify and configure GPU support for Docker, please follow the instructions provided in the [NVIDIA Container Toolkit Installation Guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).
-Once Docker is configured to use GPUs, you can run docTR Docker containers with GPU support:
-```shell
-docker run -it --gpus all ghcr.io/mindee/doctr:tf-py3.8.18-gpu-2023-09 bash
-```
-#### Available Tags
-The Docker images for docTR follow a specific tag nomenclature: `<framework>-py<python_version>-<system>-<doctr_version|YYYY-MM>`. Here's a breakdown of the tag structure:
-- `<framework>`: `tf` (TensorFlow) or `torch` (PyTorch).
-- `<python_version>`: `3.8.18`, `3.9.18`, or `3.10.13`.
-- `<system>`: `cpu` or `gpu`
-- `<doctr_version>`: a tag >= `v0.7.1`
-- `<YYYY-MM>`: e.g. `2023-09`
-Here are examples of different image tags:
-| Tag                        | Description                                       |
-|----------------------------|---------------------------------------------------|
-| `tf-py3.8.18-cpu-v0.7.1`       | TensorFlow with Python version `3.8.18` and docTR `v0.7.1`. |
-| `torch-py3.9.18-gpu-2023-09`| PyTorch with Python version `3.9.18`, GPU support and a monthly build from `2023-09`. |
-#### Building Docker Images Locally
-You can also build docTR Docker images locally on your computer.
-```shell
-docker build -t doctr .
-```
-You can specify custom Python versions and docTR versions using build arguments. For example, to build a docTR image with TensorFlow, Python version `3.9.10`, and docTR version `v0.7.0`, run the following command:
-```shell
-docker build -t doctr --build-arg FRAMEWORK=tf --build-arg PYTHON_VERSION=3.9.10 --build-arg DOCTR_VERSION=v0.7.0 .
-```
-### Example script
-An example script is provided for a simple document analysis of a PDF or image file:
-```shell
-python scripts/analyze.py path/to/your/doc.pdf
-```
-All script arguments can be checked using `python scripts/analyze.py --help`
-### Minimal API integration
-Looking to integrate docTR into your API? Here is a template to get you started with a fully working API using the wonderful [FastAPI](https://github.com/tiangolo/fastapi) framework.
-#### Deploy your API locally
-Specific dependencies are required to run the API template, which you can install as follows:
-```shell
-cd api/
-pip install poetry
-make lock
-pip install -r requirements.txt
-```
-You can now run your API locally:
-```shell
-uvicorn --reload --workers 1 --host 0.0.0.0 --port=8002 --app-dir api/ app.main:app
-```
-Alternatively, you can run the same server on a docker container if you prefer using:
-```shell
-PORT=8002 docker-compose up -d --build
-```
-#### What you have deployed
-Your API should now be running locally on your port 8002. Access your automatically-built documentation at [http://localhost:8002/redoc](http://localhost:8002/redoc) and enjoy your four functional routes ("/detection", "/recognition", "/ocr", "/kie"). Here is an example with Python to send a request to the OCR route:
-```python
-import requests
-with open('/path/to/your/doc.jpg', 'rb') as f:
-    data = f.read()
-response = requests.post("http://localhost:8002/ocr", files={'file': data}).json()
-```
-### Example notebooks
-Looking for more illustrations of docTR features? You might want to check the [Jupyter notebooks](https://github.com/mindee/doctr/tree/main/notebooks) designed to give you a broader overview.
-## Citation
-If you wish to cite this project, feel free to use this [BibTeX](http://www.bibtex.org/) reference:
-```bibtex
-@misc{doctr2021,
-    title={docTR: Document Text Recognition},
-    author={Mindee},
-    year={2021},
-    publisher = {GitHub},
-    howpublished = {\url{https://github.com/mindee/doctr}}
-}
-```
-## Contributing
-If you scrolled down to this section, you most likely appreciate open source. Do you feel like extending the range of our supported characters? Or perhaps submitting a paper implementation? Or contributing in any other way?
-You're in luck, we compiled a short guide (cf. [`CONTRIBUTING`](https://mindee.github.io/doctr/contributing/contributing.html)) for you to easily do so!
-## License
-Distributed under the Apache 2.0 License. See [`LICENSE`](https://github.com/mindee/doctr?tab=Apache-2.0-1-ov-file#readme) for more information.

main.py → app.py RENAMED Viewed

@@ -1,7 +1,3 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 import cv2
 import matplotlib.pyplot as plt
 import numpy as np

 import cv2
 import matplotlib.pyplot as plt
 import numpy as np

backend/__pycache__/pytorch.cpython-312.pyc ADDED Viewed

Binary file (3.05 kB). View file

backend/pytorch.py CHANGED Viewed

@@ -16,11 +16,7 @@ DET_ARCHS = [
     "linknet_resnet18",
     "linknet_resnet34",
     "linknet_resnet50",
-    "fast_tiny",
-    "fast_small",
-    "fast_base",
 ]
 RECO_ARCHS = [
     "crnn_vgg16_bn",
     "crnn_mobilenet_v3_small",
@@ -51,7 +47,6 @@ def load_predictor(
         assume_straight_pages: whether to assume straight pages or not
         straighten_pages: whether to straighten rotated pages or not
         bin_thresh: binarization threshold for the segmentation map
-        box_thresh: minimal objectness score to consider a box
         device: torch.device, the device to load the predictor on
     Returns:

     "linknet_resnet18",
     "linknet_resnet34",
     "linknet_resnet50",
 ]
 RECO_ARCHS = [
     "crnn_vgg16_bn",
     "crnn_mobilenet_v3_small",
         assume_straight_pages: whether to assume straight pages or not
         straighten_pages: whether to straighten rotated pages or not
         bin_thresh: binarization threshold for the segmentation map
         device: torch.device, the device to load the predictor on
     Returns:

doctr/__init__.py DELETED Viewed

@@ -1,3 +0,0 @@
-from . import io, models, datasets, transforms, utils
-from .file_utils import is_tf_available, is_torch_available
-from .version import __version__  # noqa: F401

doctr/datasets/__init__.py DELETED Viewed

@@ -1,26 +0,0 @@
-from doctr.file_utils import is_tf_available
-from .generator import *
-from .cord import *
-from .detection import *
-from .doc_artefacts import *
-from .funsd import *
-from .ic03 import *
-from .ic13 import *
-from .iiit5k import *
-from .iiithws import *
-from .imgur5k import *
-from .mjsynth import *
-from .ocr import *
-from .recognition import *
-from .orientation import *
-from .sroie import *
-from .svhn import *
-from .svt import *
-from .synthtext import *
-from .utils import *
-from .vocabs import *
-from .wildreceipt import *
-if is_tf_available():
-    from .loader import *

doctr/datasets/cord.py DELETED Viewed

@@ -1,121 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import json
-import os
-from pathlib import Path
-from typing import Any, Dict, List, Tuple, Union
-import numpy as np
-from tqdm import tqdm
-from .datasets import VisionDataset
-from .utils import convert_target_to_relative, crop_bboxes_from_image
-__all__ = ["CORD"]
-class CORD(VisionDataset):
-    """CORD dataset from `"CORD: A Consolidated Receipt Dataset for Post-OCR Parsing"
-    <https://openreview.net/pdf?id=SJl3z659UH>`_.
-    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/cord-grid.png&src=0
-        :align: center
-    >>> from doctr.datasets import CORD
-    >>> train_set = CORD(train=True, download=True)
-    >>> img, target = train_set[0]
-    Args:
-    ----
-        train: whether the subset should be the training one
-        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
-        recognition_task: whether the dataset should be used for recognition task
-        **kwargs: keyword arguments from `VisionDataset`.
-    """
-    TRAIN = (
-        "https://doctr-static.mindee.com/models?id=v0.1.1/cord_train.zip&src=0",
-        "45f9dc77f126490f3e52d7cb4f70ef3c57e649ea86d19d862a2757c9c455d7f8",
-        "cord_train.zip",
-    )
-    TEST = (
-        "https://doctr-static.mindee.com/models?id=v0.1.1/cord_test.zip&src=0",
-        "8c895e3d6f7e1161c5b7245e3723ce15c04d84be89eaa6093949b75a66fb3c58",
-        "cord_test.zip",
-    )
-    def __init__(
-        self,
-        train: bool = True,
-        use_polygons: bool = False,
-        recognition_task: bool = False,
-        **kwargs: Any,
-    ) -> None:
-        url, sha256, name = self.TRAIN if train else self.TEST
-        super().__init__(
-            url,
-            name,
-            sha256,
-            True,
-            pre_transforms=convert_target_to_relative if not recognition_task else None,
-            **kwargs,
-        )
-        # List images
-        tmp_root = os.path.join(self.root, "image")
-        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
-        self.train = train
-        np_dtype = np.float32
-        for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking CORD", total=len(os.listdir(tmp_root))):
-            # File existence check
-            if not os.path.exists(os.path.join(tmp_root, img_path)):
-                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
-            stem = Path(img_path).stem
-            _targets = []
-            with open(os.path.join(self.root, "json", f"{stem}.json"), "rb") as f:
-                label = json.load(f)
-                for line in label["valid_line"]:
-                    for word in line["words"]:
-                        if len(word["text"]) > 0:
-                            x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"]
-                            y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"]
-                            box: Union[List[float], np.ndarray]
-                            if use_polygons:
-                                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
-                                box = np.array(
-                                    [
-                                        [x[0], y[0]],
-                                        [x[1], y[1]],
-                                        [x[2], y[2]],
-                                        [x[3], y[3]],
-                                    ],
-                                    dtype=np_dtype,
-                                )
-                            else:
-                                # Reduce 8 coords to 4 -> xmin, ymin, xmax, ymax
-                                box = [min(x), min(y), max(x), max(y)]
-                            _targets.append((word["text"], box))
-            text_targets, box_targets = zip(*_targets)
-            if recognition_task:
-                crops = crop_bboxes_from_image(
-                    img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0)
-                )
-                for crop, label in zip(crops, list(text_targets)):
-                    self.data.append((crop, label))
-            else:
-                self.data.append((
-                    img_path,
-                    dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)),
-                ))
-        self.root = tmp_root
-    def extra_repr(self) -> str:
-        return f"train={self.train}"

doctr/datasets/datasets/__init__.py DELETED Viewed

@@ -1,6 +0,0 @@
-from doctr.file_utils import is_tf_available, is_torch_available
-if is_tf_available():
-    from .tensorflow import *
-elif is_torch_available():
-    from .pytorch import *  # type: ignore[assignment]

doctr/datasets/datasets/base.py DELETED Viewed

@@ -1,132 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import os
-import shutil
-from pathlib import Path
-from typing import Any, Callable, List, Optional, Tuple, Union
-import numpy as np
-from doctr.io.image import get_img_shape
-from doctr.utils.data import download_from_url
-from ...models.utils import _copy_tensor
-__all__ = ["_AbstractDataset", "_VisionDataset"]
-class _AbstractDataset:
-    data: List[Any] = []
-    _pre_transforms: Optional[Callable[[Any, Any], Tuple[Any, Any]]] = None
-    def __init__(
-        self,
-        root: Union[str, Path],
-        img_transforms: Optional[Callable[[Any], Any]] = None,
-        sample_transforms: Optional[Callable[[Any, Any], Tuple[Any, Any]]] = None,
-        pre_transforms: Optional[Callable[[Any, Any], Tuple[Any, Any]]] = None,
-    ) -> None:
-        if not Path(root).is_dir():
-            raise ValueError(f"expected a path to a reachable folder: {root}")
-        self.root = root
-        self.img_transforms = img_transforms
-        self.sample_transforms = sample_transforms
-        self._pre_transforms = pre_transforms
-        self._get_img_shape = get_img_shape
-    def __len__(self) -> int:
-        return len(self.data)
-    def _read_sample(self, index: int) -> Tuple[Any, Any]:
-        raise NotImplementedError
-    def __getitem__(self, index: int) -> Tuple[Any, Any]:
-        # Read image
-        img, target = self._read_sample(index)
-        # Pre-transforms (format conversion at run-time etc.)
-        if self._pre_transforms is not None:
-            img, target = self._pre_transforms(img, target)
-        if self.img_transforms is not None:
-            # typing issue cf. https://github.com/python/mypy/issues/5485
-            img = self.img_transforms(img)
-        if self.sample_transforms is not None:
-            # Conditions to assess it is detection model with multiple classes and avoid confusion with other tasks.
-            if (
-                isinstance(target, dict)
-                and all(isinstance(item, np.ndarray) for item in target.values())
-                and set(target.keys()) != {"boxes", "labels"}  # avoid confusion with obj detection target
-            ):
-                img_transformed = _copy_tensor(img)
-                for class_name, bboxes in target.items():
-                    img_transformed, target[class_name] = self.sample_transforms(img, bboxes)
-                img = img_transformed
-            else:
-                img, target = self.sample_transforms(img, target)
-        return img, target
-    def extra_repr(self) -> str:
-        return ""
-    def __repr__(self) -> str:
-        return f"{self.__class__.__name__}({self.extra_repr()})"
-class _VisionDataset(_AbstractDataset):
-    """Implements an abstract dataset
-    Args:
-    ----
-        url: URL of the dataset
-        file_name: name of the file once downloaded
-        file_hash: expected SHA256 of the file
-        extract_archive: whether the downloaded file is an archive to be extracted
-        download: whether the dataset should be downloaded if not present on disk
-        overwrite: whether the archive should be re-extracted
-        cache_dir: cache directory
-        cache_subdir: subfolder to use in the cache
-    """
-    def __init__(
-        self,
-        url: str,
-        file_name: Optional[str] = None,
-        file_hash: Optional[str] = None,
-        extract_archive: bool = False,
-        download: bool = False,
-        overwrite: bool = False,
-        cache_dir: Optional[str] = None,
-        cache_subdir: Optional[str] = None,
-        **kwargs: Any,
-    ) -> None:
-        cache_dir = (
-            str(os.environ.get("DOCTR_CACHE_DIR", os.path.join(os.path.expanduser("~"), ".cache", "doctr")))
-            if cache_dir is None
-            else cache_dir
-        )
-        cache_subdir = "datasets" if cache_subdir is None else cache_subdir
-        file_name = file_name if isinstance(file_name, str) else os.path.basename(url)
-        # Download the file if not present
-        archive_path: Union[str, Path] = os.path.join(cache_dir, cache_subdir, file_name)
-        if not os.path.exists(archive_path) and not download:
-            raise ValueError("the dataset needs to be downloaded first with download=True")
-        archive_path = download_from_url(url, file_name, file_hash, cache_dir=cache_dir, cache_subdir=cache_subdir)
-        # Extract the archive
-        if extract_archive:
-            archive_path = Path(archive_path)
-            dataset_path = archive_path.parent.joinpath(archive_path.stem)
-            if not dataset_path.is_dir() or overwrite:
-                shutil.unpack_archive(archive_path, dataset_path)
-        super().__init__(dataset_path if extract_archive else archive_path, **kwargs)

doctr/datasets/datasets/pytorch.py DELETED Viewed

@@ -1,59 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import os
-from copy import deepcopy
-from typing import Any, List, Tuple
-import numpy as np
-import torch
-from doctr.io import read_img_as_tensor, tensor_from_numpy
-from .base import _AbstractDataset, _VisionDataset
-__all__ = ["AbstractDataset", "VisionDataset"]
-class AbstractDataset(_AbstractDataset):
-    """Abstract class for all datasets"""
-    def _read_sample(self, index: int) -> Tuple[torch.Tensor, Any]:
-        img_name, target = self.data[index]
-        # Check target
-        if isinstance(target, dict):
-            assert "boxes" in target, "Target should contain 'boxes' key"
-            assert "labels" in target, "Target should contain 'labels' key"
-        elif isinstance(target, tuple):
-            assert len(target) == 2
-            assert isinstance(target[0], str) or isinstance(
-                target[0], np.ndarray
-            ), "first element of the tuple should be a string or a numpy array"
-            assert isinstance(target[1], list), "second element of the tuple should be a list"
-        else:
-            assert isinstance(target, str) or isinstance(
-                target, np.ndarray
-            ), "Target should be a string or a numpy array"
-        # Read image
-        img = (
-            tensor_from_numpy(img_name, dtype=torch.float32)
-            if isinstance(img_name, np.ndarray)
-            else read_img_as_tensor(os.path.join(self.root, img_name), dtype=torch.float32)
-        )
-        return img, deepcopy(target)
-    @staticmethod
-    def collate_fn(samples: List[Tuple[torch.Tensor, Any]]) -> Tuple[torch.Tensor, List[Any]]:
-        images, targets = zip(*samples)
-        images = torch.stack(images, dim=0)  # type: ignore[assignment]
-        return images, list(targets)  # type: ignore[return-value]
-class VisionDataset(AbstractDataset, _VisionDataset):  # noqa: D101
-    pass

doctr/datasets/datasets/tensorflow.py DELETED Viewed

@@ -1,59 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import os
-from copy import deepcopy
-from typing import Any, List, Tuple
-import numpy as np
-import tensorflow as tf
-from doctr.io import read_img_as_tensor, tensor_from_numpy
-from .base import _AbstractDataset, _VisionDataset
-__all__ = ["AbstractDataset", "VisionDataset"]
-class AbstractDataset(_AbstractDataset):
-    """Abstract class for all datasets"""
-    def _read_sample(self, index: int) -> Tuple[tf.Tensor, Any]:
-        img_name, target = self.data[index]
-        # Check target
-        if isinstance(target, dict):
-            assert "boxes" in target, "Target should contain 'boxes' key"
-            assert "labels" in target, "Target should contain 'labels' key"
-        elif isinstance(target, tuple):
-            assert len(target) == 2
-            assert isinstance(target[0], str) or isinstance(
-                target[0], np.ndarray
-            ), "first element of the tuple should be a string or a numpy array"
-            assert isinstance(target[1], list), "second element of the tuple should be a list"
-        else:
-            assert isinstance(target, str) or isinstance(
-                target, np.ndarray
-            ), "Target should be a string or a numpy array"
-        # Read image
-        img = (
-            tensor_from_numpy(img_name, dtype=tf.float32)
-            if isinstance(img_name, np.ndarray)
-            else read_img_as_tensor(os.path.join(self.root, img_name), dtype=tf.float32)
-        )
-        return img, deepcopy(target)
-    @staticmethod
-    def collate_fn(samples: List[Tuple[tf.Tensor, Any]]) -> Tuple[tf.Tensor, List[Any]]:
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-        return images, list(targets)
-class VisionDataset(AbstractDataset, _VisionDataset):  # noqa: D101
-    pass

doctr/datasets/detection.py DELETED Viewed

@@ -1,98 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import json
-import os
-from typing import Any, Dict, List, Tuple, Type, Union
-import numpy as np
-from doctr.file_utils import CLASS_NAME
-from .datasets import AbstractDataset
-from .utils import pre_transform_multiclass
-__all__ = ["DetectionDataset"]
-class DetectionDataset(AbstractDataset):
-    """Implements a text detection dataset
-    >>> from doctr.datasets import DetectionDataset
-    >>> train_set = DetectionDataset(img_folder="/path/to/images",
-    >>>                              label_path="/path/to/labels.json")
-    >>> img, target = train_set[0]
-    Args:
-    ----
-        img_folder: folder with all the images of the dataset
-        label_path: path to the annotations of each image
-        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
-        **kwargs: keyword arguments from `AbstractDataset`.
-    """
-    def __init__(
-        self,
-        img_folder: str,
-        label_path: str,
-        use_polygons: bool = False,
-        **kwargs: Any,
-    ) -> None:
-        super().__init__(
-            img_folder,
-            pre_transforms=pre_transform_multiclass,
-            **kwargs,
-        )
-        # File existence check
-        self._class_names: List = []
-        if not os.path.exists(label_path):
-            raise FileNotFoundError(f"unable to locate {label_path}")
-        with open(label_path, "rb") as f:
-            labels = json.load(f)
-        self.data: List[Tuple[str, Tuple[np.ndarray, List[str]]]] = []
-        np_dtype = np.float32
-        for img_name, label in labels.items():
-            # File existence check
-            if not os.path.exists(os.path.join(self.root, img_name)):
-                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
-            geoms, polygons_classes = self.format_polygons(label["polygons"], use_polygons, np_dtype)
-            self.data.append((img_name, (np.asarray(geoms, dtype=np_dtype), polygons_classes)))
-    def format_polygons(
-        self, polygons: Union[List, Dict], use_polygons: bool, np_dtype: Type
-    ) -> Tuple[np.ndarray, List[str]]:
-        """Format polygons into an array
-        Args:
-        ----
-            polygons: the bounding boxes
-            use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
-            np_dtype: dtype of array
-        Returns:
-        -------
-            geoms: bounding boxes as np array
-            polygons_classes: list of classes for each bounding box
-        """
-        if isinstance(polygons, list):
-            self._class_names += [CLASS_NAME]
-            polygons_classes = [CLASS_NAME for _ in polygons]
-            _polygons: np.ndarray = np.asarray(polygons, dtype=np_dtype)
-        elif isinstance(polygons, dict):
-            self._class_names += list(polygons.keys())
-            polygons_classes = [k for k, v in polygons.items() for _ in v]
-            _polygons = np.concatenate([np.asarray(poly, dtype=np_dtype) for poly in polygons.values() if poly], axis=0)
-        else:
-            raise TypeError(f"polygons should be a dictionary or list, it was {type(polygons)}")
-        geoms = _polygons if use_polygons else np.concatenate((_polygons.min(axis=1), _polygons.max(axis=1)), axis=1)
-        return geoms, polygons_classes
-    @property
-    def class_names(self):
-        return sorted(set(self._class_names))

doctr/datasets/doc_artefacts.py DELETED Viewed

@@ -1,82 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import json
-import os
-from typing import Any, Dict, List, Tuple
-import numpy as np
-from .datasets import VisionDataset
-__all__ = ["DocArtefacts"]
-class DocArtefacts(VisionDataset):
-    """Object detection dataset for non-textual elements in documents.
-    The dataset includes a variety of synthetic document pages with non-textual elements.
-    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/artefacts-grid.png&src=0
-        :align: center
-    >>> from doctr.datasets import DocArtefacts
-    >>> train_set = DocArtefacts(train=True, download=True)
-    >>> img, target = train_set[0]
-    Each entry of `self.data` is `(img_name, {"boxes": float32 array, "labels": int64 array})`, where every label
-    is the index of the object's class name in `CLASSES`.
-    Args:
-    ----
-        train: whether the subset should be the training one
-        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
-        **kwargs: keyword arguments from `VisionDataset`.
-    Raises:
-    ------
-        AssertionError: if the image folder and `labels.json` do not hold the same number of entries
-        FileNotFoundError: if an image referenced in `labels.json` is missing from the image folder
-        ValueError: if an object label is not listed in `CLASSES` (raised by `list.index`)
-    """
-    URL = "https://doctr-static.mindee.com/models?id=v0.4.0/artefact_detection-13fab8ce.zip&src=0"
-    SHA256 = "13fab8ced7f84583d9dccd0c634f046c3417e62a11fe1dea6efbbaba5052471b"
-    # The position of a name in this list is the integer label stored in the targets
-    CLASSES = ["background", "qr_code", "bar_code", "logo", "photo"]
-    def __init__(
-        self,
-        train: bool = True,
-        use_polygons: bool = False,
-        **kwargs: Any,
-    ) -> None:
-        # NOTE(review): positional arguments are presumably (url, file_name, file_hash, extract_archive);
-        # confirm against `_VisionDataset.__init__`
-        super().__init__(self.URL, None, self.SHA256, True, **kwargs)
-        self.train = train
-        # Update root
-        self.root = os.path.join(self.root, "train" if train else "val")
-        # List images
-        tmp_root = os.path.join(self.root, "images")
-        with open(os.path.join(self.root, "labels.json"), "rb") as f:
-            labels = json.load(f)
-        self.data: List[Tuple[str, Dict[str, Any]]] = []
-        img_list = os.listdir(tmp_root)
-        # Only the counts are compared here; individual files are checked inside the loop below
-        if len(labels) != len(img_list):
-            raise AssertionError("the number of images and labels do not match")
-        np_dtype = np.float32
-        for img_name, label in labels.items():
-            # File existence check
-            if not os.path.exists(os.path.join(tmp_root, img_name)):
-                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_name)}")
-            # xmin, ymin, xmax, ymax
-            boxes: np.ndarray = np.asarray([obj["geometry"] for obj in label], dtype=np_dtype)
-            classes: np.ndarray = np.asarray([self.CLASSES.index(obj["label"]) for obj in label], dtype=np.int64)
-            if use_polygons:
-                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
-                # NOTE(review): `boxes[:, 0]` needs a 2-D array, so an image without any object would raise here
-                boxes = np.stack(
-                    [
-                        np.stack([boxes[:, 0], boxes[:, 1]], axis=-1),
-                        np.stack([boxes[:, 2], boxes[:, 1]], axis=-1),
-                        np.stack([boxes[:, 2], boxes[:, 3]], axis=-1),
-                        np.stack([boxes[:, 0], boxes[:, 3]], axis=-1),
-                    ],
-                    axis=1,
-                )
-            self.data.append((img_name, dict(boxes=boxes, labels=classes)))
-        # From now on the root points at the image folder, so the stored image names resolve against it
-        self.root = tmp_root
-    def extra_repr(self) -> str:
-        # Reports which subset was selected at construction time
-        return f"train={self.train}"

doctr/datasets/funsd.py DELETED Viewed

@@ -1,112 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import json
-import os
-from pathlib import Path
-from typing import Any, Dict, List, Tuple, Union
-import numpy as np
-from tqdm import tqdm
-from .datasets import VisionDataset
-from .utils import convert_target_to_relative, crop_bboxes_from_image
-__all__ = ["FUNSD"]
-class FUNSD(VisionDataset):
-    """FUNSD dataset from `"FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents"
-    <https://arxiv.org/pdf/1905.13538.pdf>`_.
-    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/funsd-grid.png&src=0
-        :align: center
-    >>> from doctr.datasets import FUNSD
-    >>> train_set = FUNSD(train=True, download=True)
-    >>> img, target = train_set[0]
-    With `recognition_task=False` each entry of `self.data` is `(img_name, {"boxes": array, "labels": list of str})`;
-    with `recognition_task=True` each entry is `(crop, text)` for one word.
-    Args:
-    ----
-        train: whether the subset should be the training one
-        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
-        recognition_task: whether the dataset should be used for recognition task
-        **kwargs: keyword arguments from `VisionDataset`.
-    """
-    URL = "https://guillaumejaume.github.io/FUNSD/dataset.zip"
-    SHA256 = "c31735649e4f441bcbb4fd0f379574f7520b42286e80b01d80b445649d54761f"
-    FILE_NAME = "funsd.zip"
-    def __init__(
-        self,
-        train: bool = True,
-        use_polygons: bool = False,
-        recognition_task: bool = False,
-        **kwargs: Any,
-    ) -> None:
-        # Targets are only passed through `convert_target_to_relative` when boxes are kept (detection mode)
-        super().__init__(
-            self.URL,
-            self.FILE_NAME,
-            self.SHA256,
-            True,
-            pre_transforms=convert_target_to_relative if not recognition_task else None,
-            **kwargs,
-        )
-        self.train = train
-        np_dtype = np.float32
-        # Use the subset
-        subfolder = os.path.join("dataset", "training_data" if train else "testing_data")
-        # List images
-        tmp_root = os.path.join(self.root, subfolder, "images")
-        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
-        # NOTE(review): the directory is listed twice (once for iteration, once for the progress bar total)
-        for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking FUNSD", total=len(os.listdir(tmp_root))):
-            # File existence check (the name comes from the listing of `tmp_root` just above)
-            if not os.path.exists(os.path.join(tmp_root, img_path)):
-                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
-            # The annotation file shares the image's stem and lives in the sibling "annotations" folder
-            stem = Path(img_path).stem
-            with open(os.path.join(self.root, subfolder, "annotations", f"{stem}.json"), "rb") as f:
-                data = json.load(f)
-            # Flatten every block of the form into (text, box) pairs, dropping words with empty text
-            _targets = [
-                (word["text"], word["box"])
-                for block in data["form"]
-                for word in block["words"]
-                if len(word["text"]) > 0
-            ]
-            # NOTE(review): unpacking raises ValueError when an annotation file holds no non-empty word
-            text_targets, box_targets = zip(*_targets)
-            if use_polygons:
-                # xmin, ymin, xmax, ymax -> (x, y) coordinates of top left, top right, bottom right, bottom left corners
-                box_targets = [  # type: ignore[assignment]
-                    [
-                        [box[0], box[1]],
-                        [box[2], box[1]],
-                        [box[2], box[3]],
-                        [box[0], box[3]],
-                    ]
-                    for box in box_targets
-                ]
-            if recognition_task:
-                crops = crop_bboxes_from_image(
-                    img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=np_dtype)
-                )
-                for crop, label in zip(crops, list(text_targets)):
-                    # filter labels with unknown characters
-                    if not any(char in label for char in ["☑", "☐", "\uf703", "\uf702"]):
-                        self.data.append((crop, label))
-            else:
-                self.data.append((
-                    img_path,
-                    dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=list(text_targets)),
-                ))
-        # From now on the root points at the image folder, so the stored image names resolve against it
-        self.root = tmp_root
-    def extra_repr(self) -> str:
-        # Reports which subset was selected at construction time
-        return f"train={self.train}"

doctr/datasets/generator/__init__.py DELETED Viewed

@@ -1,6 +0,0 @@
-# Expose the generator classes of exactly one backend: TensorFlow is preferred when it is available,
-# PyTorch is the fallback; when neither is available nothing is imported into this namespace.
-from doctr.file_utils import is_tf_available, is_torch_available
-if is_tf_available():
-    from .tensorflow import *
-elif is_torch_available():
-    from .pytorch import *  # type: ignore[assignment]

doctr/datasets/generator/base.py DELETED Viewed

@@ -1,155 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import random
-from typing import Any, Callable, List, Optional, Tuple, Union
-from PIL import Image, ImageDraw
-from doctr.io.image import tensor_from_pil
-from doctr.utils.fonts import get_font
-from ..datasets import AbstractDataset
-def synthesize_text_img(
-    text: str,
-    font_size: int = 32,
-    font_family: Optional[str] = None,
-    background_color: Optional[Tuple[int, int, int]] = None,
-    text_color: Optional[Tuple[int, int, int]] = None,
-) -> Image.Image:
-    """Generate a synthetic text image
-    Args:
-    ----
-        text: the text to render as an image
-        font_size: the size of the font
-        font_family: the font family (has to be installed on your system)
-        background_color: background color of the final image
-        text_color: text color on the final image
-    Returns:
-    -------
-        PIL image of the text
-    """
-    background_color = (0, 0, 0) if background_color is None else background_color
-    text_color = (255, 255, 255) if text_color is None else text_color
-    font = get_font(font_family, font_size)
-    left, top, right, bottom = font.getbbox(text)
-    text_w, text_h = right - left, bottom - top
-    h, w = int(round(1.3 * text_h)), int(round(1.1 * text_w))
-    # If single letter, make the image square, otherwise expand to meet the text size
-    img_size = (h, w) if len(text) > 1 else (max(h, w), max(h, w))
-    img = Image.new("RGB", img_size[::-1], color=background_color)
-    d = ImageDraw.Draw(img)
-    # Offset so that the text is centered
-    text_pos = (int(round((img_size[1] - text_w) / 2)), int(round((img_size[0] - text_h) / 2)))
-    # Draw the text
-    d.text(text_pos, text, font=font, fill=text_color)
-    return img
-class _CharacterGenerator(AbstractDataset):
-    def __init__(
-        self,
-        vocab: str,
-        num_samples: int,
-        cache_samples: bool = False,
-        font_family: Optional[Union[str, List[str]]] = None,
-        img_transforms: Optional[Callable[[Any], Any]] = None,
-        sample_transforms: Optional[Callable[[Any, Any], Tuple[Any, Any]]] = None,
-    ) -> None:
-        self.vocab = vocab
-        self._num_samples = num_samples
-        self.font_family = font_family if isinstance(font_family, list) else [font_family]  # type: ignore[list-item]
-        # Validate fonts
-        if isinstance(font_family, list):
-            for font in self.font_family:
-                try:
-                    _ = get_font(font, 10)
-                except OSError:
-                    raise ValueError(f"unable to locate font: {font}")
-        self.img_transforms = img_transforms
-        self.sample_transforms = sample_transforms
-        self._data: List[Image.Image] = []
-        if cache_samples:
-            self._data = [
-                (synthesize_text_img(char, font_family=font), idx)  # type: ignore[misc]
-                for idx, char in enumerate(self.vocab)
-                for font in self.font_family
-            ]
-    def __len__(self) -> int:
-        return self._num_samples
-    def _read_sample(self, index: int) -> Tuple[Any, int]:
-        # Samples are already cached
-        if len(self._data) > 0:
-            idx = index % len(self._data)
-            pil_img, target = self._data[idx]  # type: ignore[misc]
-        else:
-            target = index % len(self.vocab)
-            pil_img = synthesize_text_img(self.vocab[target], font_family=random.choice(self.font_family))
-        img = tensor_from_pil(pil_img)
-        return img, target
-class _WordGenerator(AbstractDataset):
-    def __init__(
-        self,
-        vocab: str,
-        min_chars: int,
-        max_chars: int,
-        num_samples: int,
-        cache_samples: bool = False,
-        font_family: Optional[Union[str, List[str]]] = None,
-        img_transforms: Optional[Callable[[Any], Any]] = None,
-        sample_transforms: Optional[Callable[[Any, Any], Tuple[Any, Any]]] = None,
-    ) -> None:
-        self.vocab = vocab
-        self.wordlen_range = (min_chars, max_chars)
-        self._num_samples = num_samples
-        self.font_family = font_family if isinstance(font_family, list) else [font_family]  # type: ignore[list-item]
-        # Validate fonts
-        if isinstance(font_family, list):
-            for font in self.font_family:
-                try:
-                    _ = get_font(font, 10)
-                except OSError:
-                    raise ValueError(f"unable to locate font: {font}")
-        self.img_transforms = img_transforms
-        self.sample_transforms = sample_transforms
-        self._data: List[Image.Image] = []
-        if cache_samples:
-            _words = [self._generate_string(*self.wordlen_range) for _ in range(num_samples)]
-            self._data = [
-                (synthesize_text_img(text, font_family=random.choice(self.font_family)), text)  # type: ignore[misc]
-                for text in _words
-            ]
-    def _generate_string(self, min_chars: int, max_chars: int) -> str:
-        num_chars = random.randint(min_chars, max_chars)
-        return "".join(random.choice(self.vocab) for _ in range(num_chars))
-    def __len__(self) -> int:
-        return self._num_samples
-    def _read_sample(self, index: int) -> Tuple[Any, str]:
-        # Samples are already cached
-        if len(self._data) > 0:
-            pil_img, target = self._data[index]  # type: ignore[misc]
-        else:
-            target = self._generate_string(*self.wordlen_range)
-            pil_img = synthesize_text_img(target, font_family=random.choice(self.font_family))
-        img = tensor_from_pil(pil_img)
-        return img, target

doctr/datasets/generator/pytorch.py DELETED Viewed

@@ -1,54 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-from torch.utils.data._utils.collate import default_collate
-from .base import _CharacterGenerator, _WordGenerator
-__all__ = ["CharacterGenerator", "WordGenerator"]
-class CharacterGenerator(_CharacterGenerator):
-    """Implements a character image generation dataset
-    >>> from doctr.datasets import CharacterGenerator
-    >>> ds = CharacterGenerator(vocab='abdef', num_samples=100)
-    >>> img, target = ds[0]
-    Args:
-    ----
-        vocab: vocabulary to take the character from
-        num_samples: number of samples that will be generated iterating over the dataset
-        cache_samples: whether generated images should be cached firsthand
-        font_family: font to use to generate the text images
-        img_transforms: composable transformations that will be applied to each image
-        sample_transforms: composable transformations that will be applied to both the image and the target
-    """
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        setattr(self, "collate_fn", default_collate)
-class WordGenerator(_WordGenerator):
-    """Implements a word image generation dataset
-    >>> from doctr.datasets import WordGenerator
-    >>> ds = WordGenerator(vocab='abdef', min_chars=1, max_chars=32, num_samples=100)
-    >>> img, target = ds[0]
-    Args:
-    ----
-        vocab: vocabulary to take the character from
-        min_chars: minimum number of characters in a word
-        max_chars: maximum number of characters in a word
-        num_samples: number of samples that will be generated iterating over the dataset
-        cache_samples: whether generated images should be cached firsthand
-        font_family: font to use to generate the text images
-        img_transforms: composable transformations that will be applied to each image
-        sample_transforms: composable transformations that will be applied to both the image and the target
-    """
-    # No override: everything is inherited from `_WordGenerator`
-    pass

doctr/datasets/generator/tensorflow.py DELETED Viewed

@@ -1,60 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import tensorflow as tf
-from .base import _CharacterGenerator, _WordGenerator
-__all__ = ["CharacterGenerator", "WordGenerator"]
-class CharacterGenerator(_CharacterGenerator):
-    """Implements a character image generation dataset
-    >>> from doctr.datasets import CharacterGenerator
-    >>> ds = CharacterGenerator(vocab='abdef', num_samples=100)
-    >>> img, target = ds[0]
-    Args:
-    ----
-        vocab: vocabulary to take the character from
-        num_samples: number of samples that will be generated iterating over the dataset
-        cache_samples: whether generated images should be cached firsthand
-        font_family: font to use to generate the text images
-        img_transforms: composable transformations that will be applied to each image
-        sample_transforms: composable transformations that will be applied to both the image and the target
-    """
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-    @staticmethod
-    def collate_fn(samples):
-        images, targets = zip(*samples)
-        images = tf.stack(images, axis=0)
-        return images, tf.convert_to_tensor(targets)
-class WordGenerator(_WordGenerator):
-    """Implements a word image generation dataset
-    >>> from doctr.datasets import WordGenerator
-    >>> ds = WordGenerator(vocab='abdef', min_chars=1, max_chars=32, num_samples=100)
-    >>> img, target = ds[0]
-    Args:
-    ----
-        vocab: vocabulary to take the character from
-        min_chars: minimum number of characters in a word
-        max_chars: maximum number of characters in a word
-        num_samples: number of samples that will be generated iterating over the dataset
-        cache_samples: whether generated images should be cached firsthand
-        font_family: font to use to generate the text images
-        img_transforms: composable transformations that will be applied to each image
-        sample_transforms: composable transformations that will be applied to both the image and the target
-    """
-    # No override: everything is inherited from `_WordGenerator`
-    pass

doctr/datasets/ic03.py DELETED Viewed

@@ -1,126 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import os
-from typing import Any, Dict, List, Tuple, Union
-import defusedxml.ElementTree as ET
-import numpy as np
-from tqdm import tqdm
-from .datasets import VisionDataset
-from .utils import convert_target_to_relative, crop_bboxes_from_image
-__all__ = ["IC03"]
-class IC03(VisionDataset):
-    """IC03 dataset from `"ICDAR 2003 Robust Reading Competitions: Entries, Results and Future Directions"
-    <http://www.iapr-tc11.org/mediawiki/index.php?title=ICDAR_2003_Robust_Reading_Competitions>`_.
-    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/ic03-grid.png&src=0
-        :align: center
-    >>> from doctr.datasets import IC03
-    >>> train_set = IC03(train=True, download=True)
-    >>> img, target = train_set[0]
-    With `recognition_task=False` each entry of `self.data` is `(img_name, {"boxes": array, "labels": list of str})`;
-    with `recognition_task=True` each entry is `(crop, text)`. Images without any rectangle are skipped.
-    Args:
-    ----
-        train: whether the subset should be the training one
-        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
-        recognition_task: whether the dataset should be used for recognition task
-        **kwargs: keyword arguments from `VisionDataset`.
-    """
-    # Each tuple is unpacked in `__init__` as (url, sha256, file_name)
-    TRAIN = (
-        "http://www.iapr-tc11.org/dataset/ICDAR2003_RobustReading/TrialTrain/scene.zip",
-        "9d86df514eb09dd693fb0b8c671ef54a0cfe02e803b1bbef9fc676061502eb94",
-        "ic03_train.zip",
-    )
-    TEST = (
-        "http://www.iapr-tc11.org/dataset/ICDAR2003_RobustReading/TrialTest/scene.zip",
-        "dbc4b5fd5d04616b8464a1b42ea22db351ee22c2546dd15ac35611857ea111f8",
-        "ic03_test.zip",
-    )
-    def __init__(
-        self,
-        train: bool = True,
-        use_polygons: bool = False,
-        recognition_task: bool = False,
-        **kwargs: Any,
-    ) -> None:
-        url, sha256, file_name = self.TRAIN if train else self.TEST
-        # Targets are only passed through `convert_target_to_relative` when boxes are kept (detection mode)
-        super().__init__(
-            url,
-            file_name,
-            sha256,
-            True,
-            pre_transforms=convert_target_to_relative if not recognition_task else None,
-            **kwargs,
-        )
-        self.train = train
-        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
-        np_dtype = np.float32
-        # Load xml data
-        # NOTE(review): `sha256` comes from the non-empty TRAIN/TEST tuples above, so the `else self.root`
-        # fallback is never taken as written
-        tmp_root = (
-            os.path.join(self.root, "SceneTrialTrain" if self.train else "SceneTrialTest") if sha256 else self.root
-        )
-        xml_tree = ET.parse(os.path.join(tmp_root, "words.xml"))
-        xml_root = xml_tree.getroot()
-        for image in tqdm(iterable=xml_root, desc="Unpacking IC03", total=len(xml_root)):
-            # Each image element must have exactly three children, otherwise this unpacking raises
-            name, _resolution, rectangles = image
-            # File existence check
-            if not os.path.exists(os.path.join(tmp_root, name.text)):
-                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, name.text)}")
-            if use_polygons:
-                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
-                _boxes = [
-                    [
-                        [float(rect.attrib["x"]), float(rect.attrib["y"])],
-                        [float(rect.attrib["x"]) + float(rect.attrib["width"]), float(rect.attrib["y"])],
-                        [
-                            float(rect.attrib["x"]) + float(rect.attrib["width"]),
-                            float(rect.attrib["y"]) + float(rect.attrib["height"]),
-                        ],
-                        [float(rect.attrib["x"]), float(rect.attrib["y"]) + float(rect.attrib["height"])],
-                    ]
-                    for rect in rectangles
-                ]
-            else:
-                # x_min, y_min, x_max, y_max
-                _boxes = [
-                    [
-                        float(rect.attrib["x"]),  # type: ignore[list-item]
-                        float(rect.attrib["y"]),  # type: ignore[list-item]
-                        float(rect.attrib["x"]) + float(rect.attrib["width"]),  # type: ignore[list-item]
-                        float(rect.attrib["y"]) + float(rect.attrib["height"]),  # type: ignore[list-item]
-                    ]
-                    for rect in rectangles
-                ]
-            # filter images without boxes
-            if len(_boxes) > 0:
-                boxes: np.ndarray = np.asarray(_boxes, dtype=np_dtype)
-                # Get the labels
-                # NOTE(review): one box is built per rectangle, but labels are gathered from every child of every
-                # rectangle with non-empty text; a rectangle with an empty-text child (or several text-bearing
-                # children) would leave `labels` out of step with `boxes` - confirm against words.xml
-                labels = [lab.text for rect in rectangles for lab in rect if lab.text]
-                if recognition_task:
-                    crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, name.text), geoms=boxes)
-                    for crop, label in zip(crops, labels):
-                        # Keep only non-degenerate crops that come with a non-empty label
-                        if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
-                            self.data.append((crop, label))
-                else:
-                    self.data.append((name.text, dict(boxes=boxes, labels=labels)))
-        # From now on the root points at the extracted subset folder, so the stored image names resolve against it
-        self.root = tmp_root
-    def extra_repr(self) -> str:
-        # Reports which subset was selected at construction time
-        return f"train={self.train}"

doctr/datasets/ic13.py DELETED Viewed

@@ -1,99 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import csv
-import os
-from pathlib import Path
-from typing import Any, Dict, List, Tuple, Union
-import numpy as np
-from tqdm import tqdm
-from .datasets import AbstractDataset
-from .utils import convert_target_to_relative, crop_bboxes_from_image
-__all__ = ["IC13"]
-class IC13(AbstractDataset):
-    """IC13 dataset from `"ICDAR 2013 Robust Reading Competition" <https://rrc.cvc.uab.es/>`_.
-    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/ic13-grid.png&src=0
-        :align: center
-    >>> # NOTE: You need to download both image and label parts from Focused Scene Text challenge Task2.1 2013-2015.
-    >>> from doctr.datasets import IC13
-    >>> train_set = IC13(img_folder="/path/to/Challenge2_Training_Task12_Images",
-    >>>                  label_folder="/path/to/Challenge2_Training_Task1_GT")
-    >>> img, target = train_set[0]
-    >>> test_set = IC13(img_folder="/path/to/Challenge2_Test_Task12_Images",
-    >>>                 label_folder="/path/to/Challenge2_Test_Task1_GT")
-    >>> img, target = test_set[0]
-    With `recognition_task=False` each entry of `self.data` is `(img_path, {"boxes": array, "labels": list of str})`;
-    with `recognition_task=True` each entry is `(crop, text)`.
-    Args:
-    ----
-        img_folder: folder with all the images of the dataset
-        label_folder: folder with all annotation files for the images
-        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
-        recognition_task: whether the dataset should be used for recognition task
-        **kwargs: keyword arguments from `AbstractDataset`.
-    Raises:
-    ------
-        FileNotFoundError: if `label_folder` or `img_folder` does not exist
-    """
-    def __init__(
-        self,
-        img_folder: str,
-        label_folder: str,
-        use_polygons: bool = False,
-        recognition_task: bool = False,
-        **kwargs: Any,
-    ) -> None:
-        # Targets are only passed through `convert_target_to_relative` when boxes are kept (detection mode)
-        super().__init__(
-            img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
-        )
-        # File existence check (the message names the label folder first when both are missing)
-        if not os.path.exists(label_folder) or not os.path.exists(img_folder):
-            raise FileNotFoundError(
-                f"unable to locate {label_folder if not os.path.exists(label_folder) else img_folder}"
-            )
-        self.data: List[Tuple[Union[Path, np.ndarray], Union[str, Dict[str, Any]]]] = []
-        np_dtype = np.float32
-        img_names = os.listdir(img_folder)
-        for img_name in tqdm(iterable=img_names, desc="Unpacking IC13", total=len(img_names)):
-            img_path = Path(img_folder, img_name)
-            # The annotation file is named "gt_<image stem>.txt"
-            label_path = Path(label_folder, "gt_" + Path(img_name).stem + ".txt")
-            # NOTE(review): no `encoding` is passed, so the platform default encoding is used to decode labels
-            with open(label_path, newline="\n") as f:
-                # Space-separated fields; a trailing comma on any field is dropped
-                _lines = [
-                    [val[:-1] if val.endswith(",") else val for val in row]
-                    for row in csv.reader(f, delimiter=" ", quotechar="'")
-                ]
-            # The last field of each line is the transcription, with double quotes removed
-            labels = [line[-1].replace('"', "") for line in _lines]
-            # xmin, ymin, xmax, ymax
-            box_targets: np.ndarray = np.array([list(map(int, line[:4])) for line in _lines], dtype=np_dtype)
-            if use_polygons:
-                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
-                box_targets = np.array(
-                    [
-                        [
-                            [coords[0], coords[1]],
-                            [coords[2], coords[1]],
-                            [coords[2], coords[3]],
-                            [coords[0], coords[3]],
-                        ]
-                        for coords in box_targets
-                    ],
-                    dtype=np_dtype,
-                )
-            if recognition_task:
-                crops = crop_bboxes_from_image(img_path=img_path, geoms=box_targets)
-                for crop, label in zip(crops, labels):
-                    self.data.append((crop, label))
-            else:
-                # NOTE(review): the stored path already includes `img_folder`; if the sample reader joins it with
-                # the dataset root again, a relative `img_folder` would be duplicated - verify against the base class
-                self.data.append((img_path, dict(boxes=box_targets, labels=labels)))

doctr/datasets/iiit5k.py DELETED Viewed

@@ -1,103 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import os
-from typing import Any, Dict, List, Tuple, Union
-import numpy as np
-import scipy.io as sio
-from tqdm import tqdm
-from .datasets import VisionDataset
-from .utils import convert_target_to_relative
-__all__ = ["IIIT5K"]
-class IIIT5K(VisionDataset):
-    """IIIT-5K character-level localization dataset from
-    `"BMVC 2012 Scene Text Recognition using Higher Order Language Priors"
-    <https://cdn.iiit.ac.in/cdn/cvit.iiit.ac.in/images/Projects/SceneTextUnderstanding/home/mishraBMVC12.pdf>`_.
-    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/iiit5k-grid.png&src=0
-        :align: center
-    >>> # NOTE: this dataset is for character-level localization
-    >>> from doctr.datasets import IIIT5K
-    >>> train_set = IIIT5K(train=True, download=True)
-    >>> img, target = train_set[0]
-    Args:
-    ----
-        train: whether the subset should be the training one
-        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
-        recognition_task: whether the dataset should be used for recognition task
-        **kwargs: keyword arguments from `VisionDataset`.
-    """
-    URL = "https://cvit.iiit.ac.in/images/Projects/SceneTextUnderstanding/IIIT5K-Word_V3.0.tar.gz"
-    SHA256 = "7872c9efbec457eb23f3368855e7738f72ce10927f52a382deb4966ca0ffa38e"
-    def __init__(
-        self,
-        train: bool = True,
-        use_polygons: bool = False,
-        recognition_task: bool = False,
-        **kwargs: Any,
-    ) -> None:
-        super().__init__(
-            self.URL,
-            None,
-            file_hash=self.SHA256,
-            extract_archive=True,
-            pre_transforms=convert_target_to_relative if not recognition_task else None,
-            **kwargs,
-        )
-        self.train = train
-        # Load mat data
-        tmp_root = os.path.join(self.root, "IIIT5K") if self.SHA256 else self.root
-        mat_file = "trainCharBound" if self.train else "testCharBound"
-        mat_data = sio.loadmat(os.path.join(tmp_root, f"{mat_file}.mat"))[mat_file][0]
-        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
-        np_dtype = np.float32
-        for img_path, label, box_targets in tqdm(iterable=mat_data, desc="Unpacking IIIT5K", total=len(mat_data)):
-            _raw_path = img_path[0]
-            _raw_label = label[0]
-            # File existence check
-            if not os.path.exists(os.path.join(tmp_root, _raw_path)):
-                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, _raw_path)}")
-            if recognition_task:
-                self.data.append((_raw_path, _raw_label))
-            else:
-                if use_polygons:
-                    # (x, y) coordinates of top left, top right, bottom right, bottom left corners
-                    box_targets = [
-                        [
-                            [box[0], box[1]],
-                            [box[0] + box[2], box[1]],
-                            [box[0] + box[2], box[1] + box[3]],
-                            [box[0], box[1] + box[3]],
-                        ]
-                        for box in box_targets
-                    ]
-                else:
-                    # xmin, ymin, xmax, ymax
-                    box_targets = [[box[0], box[1], box[0] + box[2], box[1] + box[3]] for box in box_targets]
-                # label are casted to list where each char corresponds to the character's bounding box
-                self.data.append((
-                    _raw_path,
-                    dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=list(_raw_label)),
-                ))
-        self.root = tmp_root
-    def extra_repr(self) -> str:
-        return f"train={self.train}"

doctr/datasets/iiithws.py DELETED Viewed

@@ -1,75 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import os
-from random import sample
-from typing import Any, List, Tuple
-from tqdm import tqdm
-from .datasets import AbstractDataset
-__all__ = ["IIITHWS"]
-class IIITHWS(AbstractDataset):
-    """IIITHWS dataset from `"Generating Synthetic Data for Text Recognition"
-    <https://arxiv.org/pdf/1608.04224.pdf>`_ | `"repository" <https://github.com/kris314/hwnet>`_ |
-    `"website" <https://cvit.iiit.ac.in/research/projects/cvit-projects/matchdocimgs>`_.
-    >>> # NOTE: This is a pure recognition dataset without bounding box labels.
-    >>> # NOTE: You need to download the dataset.
-    >>> from doctr.datasets import IIITHWS
-    >>> train_set = IIITHWS(img_folder="/path/to/iiit-hws/Images_90K_Normalized",
-    >>>                     label_path="/path/to/IIIT-HWS-90K.txt",
-    >>>                     train=True)
-    >>> img, target = train_set[0]
-    >>> test_set = IIITHWS(img_folder="/path/to/iiit-hws/Images_90K_Normalized",
-    >>>                    label_path="/path/to/IIIT-HWS-90K.txt")
-    >>>                    train=False)
-    >>> img, target = test_set[0]
-    Args:
-    ----
-        img_folder: folder with all the images of the dataset
-        label_path: path to the file with the labels
-        train: whether the subset should be the training one
-        **kwargs: keyword arguments from `AbstractDataset`.
-    """
-    def __init__(
-        self,
-        img_folder: str,
-        label_path: str,
-        train: bool = True,
-        **kwargs: Any,
-    ) -> None:
-        super().__init__(img_folder, **kwargs)
-        # File existence check
-        if not os.path.exists(label_path) or not os.path.exists(img_folder):
-            raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")
-        self.data: List[Tuple[str, str]] = []
-        self.train = train
-        with open(label_path) as f:
-            annotations = f.readlines()
-        # Shuffle the dataset otherwise the test set will contain the same labels n times
-        annotations = sample(annotations, len(annotations))
-        train_samples = int(len(annotations) * 0.9)
-        set_slice = slice(train_samples) if self.train else slice(train_samples, None)
-        for annotation in tqdm(
-            iterable=annotations[set_slice], desc="Unpacking IIITHWS", total=len(annotations[set_slice])
-        ):
-            img_path, label = annotation.split()[0:2]
-            img_path = os.path.join(img_folder, img_path)
-            self.data.append((img_path, label))
-    def extra_repr(self) -> str:
-        return f"train={self.train}"

doctr/datasets/imgur5k.py DELETED Viewed

@@ -1,147 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import glob
-import json
-import os
-from pathlib import Path
-from typing import Any, Dict, List, Tuple, Union
-import cv2
-import numpy as np
-from PIL import Image
-from tqdm import tqdm
-from .datasets import AbstractDataset
-from .utils import convert_target_to_relative, crop_bboxes_from_image
-__all__ = ["IMGUR5K"]
-class IMGUR5K(AbstractDataset):
-    """IMGUR5K dataset from `"TextStyleBrush: Transfer of Text Aesthetics from a Single Example"
-    <https://arxiv.org/abs/2106.08385>`_ |
-    `repository <https://github.com/facebookresearch/IMGUR5K-Handwriting-Dataset>`_.
-    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/imgur5k-grid.png&src=0
-        :align: center
-        :width: 630
-        :height: 400
-    >>> # NOTE: You need to download/generate the dataset from the repository.
-    >>> from doctr.datasets import IMGUR5K
-    >>> train_set = IMGUR5K(train=True, img_folder="/path/to/IMGUR5K-Handwriting-Dataset/images",
-    >>>                     label_path="/path/to/IMGUR5K-Handwriting-Dataset/dataset_info/imgur5k_annotations.json")
-    >>> img, target = train_set[0]
-    >>> test_set = IMGUR5K(train=False, img_folder="/path/to/IMGUR5K-Handwriting-Dataset/images",
-    >>>                    label_path="/path/to/IMGUR5K-Handwriting-Dataset/dataset_info/imgur5k_annotations.json")
-    >>> img, target = test_set[0]
-    Args:
-    ----
-        img_folder: folder with all the images of the dataset
-        label_path: path to the annotations file of the dataset
-        train: whether the subset should be the training one
-        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
-        recognition_task: whether the dataset should be used for recognition task
-        **kwargs: keyword arguments from `AbstractDataset`.
-    """
-    def __init__(
-        self,
-        img_folder: str,
-        label_path: str,
-        train: bool = True,
-        use_polygons: bool = False,
-        recognition_task: bool = False,
-        **kwargs: Any,
-    ) -> None:
-        super().__init__(
-            img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
-        )
-        # File existence check
-        if not os.path.exists(label_path) or not os.path.exists(img_folder):
-            raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")
-        self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any]]]] = []
-        self.train = train
-        np_dtype = np.float32
-        img_names = os.listdir(img_folder)
-        train_samples = int(len(img_names) * 0.9)
-        set_slice = slice(train_samples) if self.train else slice(train_samples, None)
-        # define folder to write IMGUR5K recognition dataset
-        reco_folder_name = "IMGUR5K_recognition_train" if self.train else "IMGUR5K_recognition_test"
-        reco_folder_name = "Poly_" + reco_folder_name if use_polygons else reco_folder_name
-        reco_folder_path = os.path.join(os.path.dirname(self.root), reco_folder_name)
-        reco_images_counter = 0
-        if recognition_task and os.path.isdir(reco_folder_path):
-            self._read_from_folder(reco_folder_path)
-            return
-        elif recognition_task and not os.path.isdir(reco_folder_path):
-            os.makedirs(reco_folder_path, exist_ok=False)
-        with open(label_path) as f:
-            annotation_file = json.load(f)
-        for img_name in tqdm(iterable=img_names[set_slice], desc="Unpacking IMGUR5K", total=len(img_names[set_slice])):
-            img_path = Path(img_folder, img_name)
-            img_id = img_name.split(".")[0]
-            # File existence check
-            if not os.path.exists(os.path.join(self.root, img_name)):
-                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
-            # some files have no annotations which are marked with only a dot in the 'word' key
-            # ref: https://github.com/facebookresearch/IMGUR5K-Handwriting-Dataset/blob/main/README.md
-            if img_id not in annotation_file["index_to_ann_map"].keys():
-                continue
-            ann_ids = annotation_file["index_to_ann_map"][img_id]
-            annotations = [annotation_file["ann_id"][a_id] for a_id in ann_ids]
-            labels = [ann["word"] for ann in annotations if ann["word"] != "."]
-            # x_center, y_center, width, height, angle
-            _boxes = [
-                list(map(float, ann["bounding_box"].strip("[ ]").split(", ")))
-                for ann in annotations
-                if ann["word"] != "."
-            ]
-            # (x, y) coordinates of top left, top right, bottom right, bottom left corners
-            box_targets = [cv2.boxPoints(((box[0], box[1]), (box[2], box[3]), box[4])) for box in _boxes]  # type: ignore[arg-type]
-            if not use_polygons:
-                # xmin, ymin, xmax, ymax
-                box_targets = [np.concatenate((points.min(0), points.max(0)), axis=-1) for points in box_targets]
-            # filter images without boxes
-            if len(box_targets) > 0:
-                if recognition_task:
-                    crops = crop_bboxes_from_image(
-                        img_path=os.path.join(self.root, img_name), geoms=np.asarray(box_targets, dtype=np_dtype)
-                    )
-                    for crop, label in zip(crops, labels):
-                        if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
-                            # write data to disk
-                            with open(os.path.join(reco_folder_path, f"{reco_images_counter}.txt"), "w") as f:
-                                f.write(label)
-                                tmp_img = Image.fromarray(crop)
-                                tmp_img.save(os.path.join(reco_folder_path, f"{reco_images_counter}.png"))
-                                reco_images_counter += 1
-                else:
-                    self.data.append((img_path, dict(boxes=np.asarray(box_targets, dtype=np_dtype), labels=labels)))
-        if recognition_task:
-            self._read_from_folder(reco_folder_path)
-    def extra_repr(self) -> str:
-        return f"train={self.train}"
-    def _read_from_folder(self, path: str) -> None:
-        for img_path in glob.glob(os.path.join(path, "*.png")):
-            with open(os.path.join(path, f"{os.path.basename(img_path)[:-4]}.txt"), "r") as f:
-                self.data.append((img_path, f.read()))

doctr/datasets/loader.py DELETED Viewed

@@ -1,102 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import math
-from typing import Callable, Optional
-import numpy as np
-import tensorflow as tf
-from doctr.utils.multithreading import multithread_exec
-__all__ = ["DataLoader"]
-def default_collate(samples):
-    """Collate multiple elements into batches
-    Args:
-    ----
-        samples: list of N tuples containing M elements
-    Returns:
-    -------
-        Tuple of M sequences contianing N elements each
-    """
-    batch_data = zip(*samples)
-    tf_data = tuple(tf.stack(elt, axis=0) for elt in batch_data)
-    return tf_data
-class DataLoader:
-    """Implements a dataset wrapper for fast data loading
-    >>> from doctr.datasets import CORD, DataLoader
-    >>> train_set = CORD(train=True, download=True)
-    >>> train_loader = DataLoader(train_set, batch_size=32)
-    >>> train_iter = iter(train_loader)
-    >>> images, targets = next(train_iter)
-    Args:
-    ----
-        dataset: the dataset
-        shuffle: whether the samples should be shuffled before passing it to the iterator
-        batch_size: number of elements in each batch
-        drop_last: if `True`, drops the last batch if it isn't full
-        num_workers: number of workers to use for data loading
-        collate_fn: function to merge samples into a batch
-    """
-    def __init__(
-        self,
-        dataset,
-        shuffle: bool = True,
-        batch_size: int = 1,
-        drop_last: bool = False,
-        num_workers: Optional[int] = None,
-        collate_fn: Optional[Callable] = None,
-    ) -> None:
-        self.dataset = dataset
-        self.shuffle = shuffle
-        self.batch_size = batch_size
-        nb = len(self.dataset) / batch_size
-        self.num_batches = math.floor(nb) if drop_last else math.ceil(nb)
-        if collate_fn is None:
-            self.collate_fn = self.dataset.collate_fn if hasattr(self.dataset, "collate_fn") else default_collate
-        else:
-            self.collate_fn = collate_fn
-        self.num_workers = num_workers
-        self.reset()
-    def __len__(self) -> int:
-        return self.num_batches
-    def reset(self) -> None:
-        # Updates indices after each epoch
-        self._num_yielded = 0
-        self.indices = np.arange(len(self.dataset))
-        if self.shuffle is True:
-            np.random.shuffle(self.indices)
-    def __iter__(self):
-        self.reset()
-        return self
-    def __next__(self):
-        if self._num_yielded < self.num_batches:
-            # Get next indices
-            idx = self._num_yielded * self.batch_size
-            indices = self.indices[idx : min(len(self.dataset), idx + self.batch_size)]
-            samples = list(multithread_exec(self.dataset.__getitem__, indices, threads=self.num_workers))
-            batch_data = self.collate_fn(samples)
-            self._num_yielded += 1
-            return batch_data
-        else:
-            raise StopIteration

doctr/datasets/mjsynth.py DELETED Viewed

@@ -1,106 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import os
-from typing import Any, List, Tuple
-from tqdm import tqdm
-from .datasets import AbstractDataset
-__all__ = ["MJSynth"]
-class MJSynth(AbstractDataset):
-    """MJSynth dataset from `"Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition"
-    <https://www.robots.ox.ac.uk/~vgg/data/text/>`_.
-    >>> # NOTE: This is a pure recognition dataset without bounding box labels.
-    >>> # NOTE: You need to download the dataset.
-    >>> from doctr.datasets import MJSynth
-    >>> train_set = MJSynth(img_folder="/path/to/mjsynth/mnt/ramdisk/max/90kDICT32px",
-    >>>                     label_path="/path/to/mjsynth/mnt/ramdisk/max/90kDICT32px/imlist.txt",
-    >>>                     train=True)
-    >>> img, target = train_set[0]
-    >>> test_set = MJSynth(img_folder="/path/to/mjsynth/mnt/ramdisk/max/90kDICT32px",
-    >>>                    label_path="/path/to/mjsynth/mnt/ramdisk/max/90kDICT32px/imlist.txt")
-    >>>                    train=False)
-    >>> img, target = test_set[0]
-    Args:
-    ----
-        img_folder: folder with all the images of the dataset
-        label_path: path to the file with the labels
-        train: whether the subset should be the training one
-        **kwargs: keyword arguments from `AbstractDataset`.
-    """
-    # filter corrupted or missing images
-    BLACKLIST = [
-        "./1881/4/225_Marbling_46673.jpg\n",
-        "./2069/4/192_whittier_86389.jpg\n",
-        "./869/4/234_TRIASSIC_80582.jpg\n",
-        "./173/2/358_BURROWING_10395.jpg\n",
-        "./913/4/231_randoms_62372.jpg\n",
-        "./596/2/372_Ump_81662.jpg\n",
-        "./936/2/375_LOCALITIES_44992.jpg\n",
-        "./2540/4/246_SQUAMOUS_73902.jpg\n",
-        "./1332/4/224_TETHERED_78397.jpg\n",
-        "./627/6/83_PATRIARCHATE_55931.jpg\n",
-        "./2013/2/370_refract_63890.jpg\n",
-        "./2911/6/77_heretical_35885.jpg\n",
-        "./1730/2/361_HEREON_35880.jpg\n",
-        "./2194/2/334_EFFLORESCENT_24742.jpg\n",
-        "./2025/2/364_SNORTERS_72304.jpg\n",
-        "./368/4/232_friar_30876.jpg\n",
-        "./275/6/96_hackle_34465.jpg\n",
-        "./384/4/220_bolts_8596.jpg\n",
-        "./905/4/234_Postscripts_59142.jpg\n",
-        "./2749/6/101_Chided_13155.jpg\n",
-        "./495/6/81_MIDYEAR_48332.jpg\n",
-        "./2852/6/60_TOILSOME_79481.jpg\n",
-        "./554/2/366_Teleconferences_77948.jpg\n",
-        "./1696/4/211_Queened_61779.jpg\n",
-        "./2128/2/369_REDACTED_63458.jpg\n",
-        "./2557/2/351_DOWN_23492.jpg\n",
-        "./2489/4/221_snored_72290.jpg\n",
-        "./1650/2/355_stony_74902.jpg\n",
-        "./1863/4/223_Diligently_21672.jpg\n",
-        "./264/2/362_FORETASTE_30276.jpg\n",
-        "./429/4/208_Mainmasts_46140.jpg\n",
-        "./1817/2/363_actuating_904.jpg\n",
-    ]
-    def __init__(
-        self,
-        img_folder: str,
-        label_path: str,
-        train: bool = True,
-        **kwargs: Any,
-    ) -> None:
-        super().__init__(img_folder, **kwargs)
-        # File existence check
-        if not os.path.exists(label_path) or not os.path.exists(img_folder):
-            raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")
-        self.data: List[Tuple[str, str]] = []
-        self.train = train
-        with open(label_path) as f:
-            img_paths = f.readlines()
-        train_samples = int(len(img_paths) * 0.9)
-        set_slice = slice(train_samples) if self.train else slice(train_samples, None)
-        for path in tqdm(iterable=img_paths[set_slice], desc="Unpacking MJSynth", total=len(img_paths[set_slice])):
-            if path not in self.BLACKLIST:
-                label = path.split("_")[1]
-                img_path = os.path.join(img_folder, path[2:]).strip()
-                self.data.append((img_path, label))
-    def extra_repr(self) -> str:
-        return f"train={self.train}"

doctr/datasets/ocr.py DELETED Viewed

@@ -1,71 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import json
-import os
-from pathlib import Path
-from typing import Any, Dict, List, Tuple
-import numpy as np
-from .datasets import AbstractDataset
-__all__ = ["OCRDataset"]
-class OCRDataset(AbstractDataset):
-    """Implements an OCR dataset
-    >>> from doctr.datasets import OCRDataset
-    >>> train_set = OCRDataset(img_folder="/path/to/images",
-    >>>                        label_file="/path/to/labels.json")
-    >>> img, target = train_set[0]
-    Args:
-    ----
-        img_folder: local path to image folder (all jpg at the root)
-        label_file: local path to the label file
-        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
-        **kwargs: keyword arguments from `AbstractDataset`.
-    """
-    def __init__(
-        self,
-        img_folder: str,
-        label_file: str,
-        use_polygons: bool = False,
-        **kwargs: Any,
-    ) -> None:
-        super().__init__(img_folder, **kwargs)
-        # List images
-        self.data: List[Tuple[str, Dict[str, Any]]] = []
-        np_dtype = np.float32
-        with open(label_file, "rb") as f:
-            data = json.load(f)
-        for img_name, annotations in data.items():
-            # Get image path
-            img_name = Path(img_name)
-            # File existence check
-            if not os.path.exists(os.path.join(self.root, img_name)):
-                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
-            # handle empty images
-            if len(annotations["typed_words"]) == 0:
-                self.data.append((img_name, dict(boxes=np.zeros((0, 4), dtype=np_dtype), labels=[])))
-                continue
-            # Unpack the straight boxes (xmin, ymin, xmax, ymax)
-            geoms = [list(map(float, obj["geometry"][:4])) for obj in annotations["typed_words"]]
-            if use_polygons:
-                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
-                geoms = [
-                    [geom[:2], [geom[2], geom[1]], geom[2:], [geom[0], geom[3]]]  # type: ignore[list-item]
-                    for geom in geoms
-                ]
-            text_targets = [obj["value"] for obj in annotations["typed_words"]]
-            self.data.append((img_name, dict(boxes=np.asarray(geoms, dtype=np_dtype), labels=text_targets)))

doctr/datasets/orientation.py DELETED Viewed

@@ -1,40 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import os
-from typing import Any, List, Tuple
-import numpy as np
-from .datasets import AbstractDataset
-__all__ = ["OrientationDataset"]
-class OrientationDataset(AbstractDataset):
-    """Implements a basic image dataset where targets are filled with zeros.
-    >>> from doctr.datasets import OrientationDataset
-    >>> train_set = OrientationDataset(img_folder="/path/to/images")
-    >>> img, target = train_set[0]
-    Args:
-    ----
-        img_folder: folder with all the images of the dataset
-        **kwargs: keyword arguments from `AbstractDataset`.
-    """
-    def __init__(
-        self,
-        img_folder: str,
-        **kwargs: Any,
-    ) -> None:
-        super().__init__(
-            img_folder,
-            **kwargs,
-        )
-        # initialize dataset with 0 degree rotation targets
-        self.data: List[Tuple[str, np.ndarray]] = [(img_name, np.array([0])) for img_name in os.listdir(self.root)]

doctr/datasets/recognition.py DELETED Viewed

@@ -1,56 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import json
-import os
-from pathlib import Path
-from typing import Any, List, Tuple
-from .datasets import AbstractDataset
-__all__ = ["RecognitionDataset"]
-class RecognitionDataset(AbstractDataset):
-    """Dataset implementation for text recognition tasks
-    >>> from doctr.datasets import RecognitionDataset
-    >>> train_set = RecognitionDataset(img_folder="/path/to/images",
-    >>>                                labels_path="/path/to/labels.json")
-    >>> img, target = train_set[0]
-    Args:
-    ----
-        img_folder: path to the images folder
-        labels_path: pathe to the json file containing all labels (character sequences)
-        **kwargs: keyword arguments from `AbstractDataset`.
-    """
-    def __init__(
-        self,
-        img_folder: str,
-        labels_path: str,
-        **kwargs: Any,
-    ) -> None:
-        super().__init__(img_folder, **kwargs)
-        self.data: List[Tuple[str, str]] = []
-        with open(labels_path, encoding="utf-8") as f:
-            labels = json.load(f)
-        for img_name, label in labels.items():
-            if not os.path.exists(os.path.join(self.root, img_name)):
-                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
-            self.data.append((img_name, label))
-    def merge_dataset(self, ds: AbstractDataset) -> None:
-        # Update data with new root for self
-        self.data = [(str(Path(self.root).joinpath(img_path)), label) for img_path, label in self.data]
-        # Define new root
-        self.root = Path("/")
-        # Merge with ds data
-        for img_path, label in ds.data:
-            self.data.append((str(Path(ds.root).joinpath(img_path)), label))

doctr/datasets/sroie.py DELETED Viewed

@@ -1,103 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import csv
-import os
-from pathlib import Path
-from typing import Any, Dict, List, Tuple, Union
-import numpy as np
-from tqdm import tqdm
-from .datasets import VisionDataset
-from .utils import convert_target_to_relative, crop_bboxes_from_image
-__all__ = ["SROIE"]
-class SROIE(VisionDataset):
-    """SROIE dataset from `"ICDAR2019 Competition on Scanned Receipt OCR and Information Extraction"
-    <https://arxiv.org/pdf/2103.10213.pdf>`_.
-    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/sroie-grid.png&src=0
-        :align: center
-    >>> from doctr.datasets import SROIE
-    >>> train_set = SROIE(train=True, download=True)
-    >>> img, target = train_set[0]
-    Args:
-    ----
-        train: whether the subset should be the training one
-        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
-        recognition_task: whether the dataset should be used for recognition task
-        **kwargs: keyword arguments from `VisionDataset`.
-    """
-    TRAIN = (
-        "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_train_task1.zip&src=0",
-        "d4fa9e60abb03500d83299c845b9c87fd9c9430d1aeac96b83c5d0bb0ab27f6f",
-        "sroie2019_train_task1.zip",
-    )
-    TEST = (
-        "https://doctr-static.mindee.com/models?id=v0.1.1/sroie2019_test.zip&src=0",
-        "41b3c746a20226fddc80d86d4b2a903d43b5be4f521dd1bbe759dbf8844745e2",
-        "sroie2019_test.zip",
-    )
-    def __init__(
-        self,
-        train: bool = True,
-        use_polygons: bool = False,
-        recognition_task: bool = False,
-        **kwargs: Any,
-    ) -> None:
-        url, sha256, name = self.TRAIN if train else self.TEST
-        super().__init__(
-            url,
-            name,
-            sha256,
-            True,
-            pre_transforms=convert_target_to_relative if not recognition_task else None,
-            **kwargs,
-        )
-        self.train = train
-        tmp_root = os.path.join(self.root, "images")
-        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []
-        np_dtype = np.float32
-        for img_path in tqdm(iterable=os.listdir(tmp_root), desc="Unpacking SROIE", total=len(os.listdir(tmp_root))):
-            # File existence check
-            if not os.path.exists(os.path.join(tmp_root, img_path)):
-                raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
-            stem = Path(img_path).stem
-            with open(os.path.join(self.root, "annotations", f"{stem}.txt"), encoding="latin") as f:
-                _rows = [row for row in list(csv.reader(f, delimiter=",")) if len(row) > 0]
-            labels = [",".join(row[8:]) for row in _rows]
-            # reorder coordinates (8 -> (4,2) ->
-            # (x, y) coordinates of top left, top right, bottom right, bottom left corners) and filter empty lines
-            coords: np.ndarray = np.stack(
-                [np.array(list(map(int, row[:8])), dtype=np_dtype).reshape((4, 2)) for row in _rows], axis=0
-            )
-            if not use_polygons:
-                # xmin, ymin, xmax, ymax
-                coords = np.concatenate((coords.min(axis=1), coords.max(axis=1)), axis=1)
-            if recognition_task:
-                crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_path), geoms=coords)
-                for crop, label in zip(crops, labels):
-                    if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
-                        self.data.append((crop, label))
-            else:
-                self.data.append((img_path, dict(boxes=coords, labels=labels)))
-        self.root = tmp_root
-    def extra_repr(self) -> str:
-        return f"train={self.train}"

doctr/datasets/svhn.py DELETED Viewed

@@ -1,131 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import os
-from typing import Any, Dict, List, Tuple, Union
-import h5py
-import numpy as np
-from tqdm import tqdm
-from .datasets import VisionDataset
-from .utils import convert_target_to_relative, crop_bboxes_from_image
-__all__ = ["SVHN"]
class SVHN(VisionDataset):
    """SVHN dataset from `"The Street View House Numbers (SVHN) Dataset"
    <http://ufldl.stanford.edu/housenumbers/>`_.

    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/svhn-grid.png&src=0
        :align: center

    >>> from doctr.datasets import SVHN
    >>> train_set = SVHN(train=True, download=True)
    >>> img, target = train_set[0]

    Args:
    ----
        train: whether the subset should be the training one
        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
        recognition_task: whether the dataset should be used for recognition task
        **kwargs: keyword arguments from `VisionDataset`.
    """

    # (download URL, SHA256 of the archive, local file name) for each subset
    TRAIN = (
        "http://ufldl.stanford.edu/housenumbers/train.tar.gz",
        "4b17bb33b6cd8f963493168f80143da956f28ec406cc12f8e5745a9f91a51898",
        "svhn_train.tar",
    )
    TEST = (
        "http://ufldl.stanford.edu/housenumbers/test.tar.gz",
        "57ac9ceb530e4aa85b55d991be8fc49c695b3d71c6f6a88afea86549efde7fb5",
        "svhn_test.tar",
    )

    def __init__(
        self,
        train: bool = True,
        use_polygons: bool = False,
        recognition_task: bool = False,
        **kwargs: Any,
    ) -> None:
        archive_url, archive_hash, archive_name = self.TRAIN if train else self.TEST
        super().__init__(
            archive_url,
            file_name=archive_name,
            file_hash=archive_hash,
            extract_archive=True,
            # Box targets are only converted to relative coordinates for detection samples
            pre_transforms=None if recognition_task else convert_target_to_relative,
            **kwargs,
        )
        self.train = train
        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []

        subset_dir = os.path.join(self.root, "train" if train else "test")

        # Load mat data (matlab v7.3 - can not be loaded with scipy)
        with h5py.File(os.path.join(subset_dir, "digitStruct.mat"), "r") as mat_file:
            name_refs = mat_file["digitStruct/name"]
            bbox_refs = mat_file["digitStruct/bbox"]
            for name_ref, bbox_ref in tqdm(
                iterable=zip(name_refs, bbox_refs), desc="Unpacking SVHN", total=len(name_refs)
            ):
                # The file name is stored as a matrix of character codes
                img_name = "".join(chr(code) for code in mat_file[name_ref[0]][()].flatten())
                img_file = os.path.join(subset_dir, img_name)
                if not os.path.exists(img_file):
                    raise FileNotFoundError(f"unable to locate {img_file}")

                # A single digit stores its values inline; several digits store one reference per value
                box = mat_file[bbox_ref[0]]
                if box["left"].shape[0] == 1:
                    fields = {key: [int(vals[0][0])] for key, vals in box.items()}
                else:
                    fields = {key: [int(mat_file[ref[0]][()].item()) for ref in vals] for key, vals in box.items()}

                # One row per digit: (left, top, width, height)
                xywh: np.ndarray = np.array(
                    [fields["left"], fields["top"], fields["width"], fields["height"]], dtype=np.float32
                ).transpose()
                label_targets = [str(digit) for digit in fields["label"]]

                x_min, y_min = xywh[:, 0], xywh[:, 1]
                x_max, y_max = x_min + xywh[:, 2], y_min + xywh[:, 3]
                if use_polygons:
                    # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                    corners = [(x_min, y_min), (x_max, y_min), (x_max, y_max), (x_min, y_max)]
                    box_targets: np.ndarray = np.stack(
                        [np.stack(corner, axis=-1) for corner in corners],
                        axis=1,
                    )
                else:
                    # x, y, width, height -> xmin, ymin, xmax, ymax
                    box_targets = np.stack([x_min, y_min, x_max, y_max], axis=-1)

                if not recognition_task:
                    self.data.append((img_name, dict(boxes=box_targets, labels=label_targets)))
                    continue

                # Recognition: keep one (crop, label) sample per non-empty crop with a non-empty label
                crops = crop_bboxes_from_image(img_path=img_file, geoms=box_targets)
                for crop, label in zip(crops, label_targets):
                    if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
                        self.data.append((crop, label))

        self.root = subset_dir

    def extra_repr(self) -> str:
        return f"train={self.train}"

doctr/datasets/svt.py DELETED Viewed

@@ -1,117 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import os
-from typing import Any, Dict, List, Tuple, Union
-import defusedxml.ElementTree as ET
-import numpy as np
-from tqdm import tqdm
-from .datasets import VisionDataset
-from .utils import convert_target_to_relative, crop_bboxes_from_image
-__all__ = ["SVT"]
class SVT(VisionDataset):
    """SVT dataset from `"The Street View Text Dataset - UCSD Computer Vision"
    <http://vision.ucsd.edu/~kai/svt/>`_.

    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/svt-grid.png&src=0
        :align: center

    >>> from doctr.datasets import SVT
    >>> train_set = SVT(train=True, download=True)
    >>> img, target = train_set[0]

    Args:
    ----
        train: whether the subset should be the training one
        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
        recognition_task: whether the dataset should be used for recognition task
        **kwargs: keyword arguments from `VisionDataset`.
    """

    URL = "http://vision.ucsd.edu/~kai/svt/svt.zip"
    SHA256 = "63b3d55e6b6d1e036e2a844a20c034fe3af3c32e4d914d6e0c4a3cd43df3bebf"

    def __init__(
        self,
        train: bool = True,
        use_polygons: bool = False,
        recognition_task: bool = False,
        **kwargs: Any,
    ) -> None:
        super().__init__(
            self.URL,
            None,
            self.SHA256,
            True,
            # Box targets are only converted to relative coordinates for detection samples
            pre_transforms=None if recognition_task else convert_target_to_relative,
            **kwargs,
        )
        self.train = train
        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []

        # Load xml data
        tmp_root = os.path.join(self.root, "svt1") if self.SHA256 else self.root
        annotation_file = "train.xml" if self.train else "test.xml"
        xml_root = ET.parse(os.path.join(tmp_root, annotation_file)).getroot()

        for image in tqdm(iterable=xml_root, desc="Unpacking SVT", total=len(xml_root)):
            # Each image node unpacks into exactly five children; only name and rectangles are used
            name, _, _, _resolution, rectangles = image

            img_file = os.path.join(tmp_root, name.text)
            if not os.path.exists(img_file):
                raise FileNotFoundError(f"unable to locate {img_file}")

            _boxes: List[Any] = []
            for rect in rectangles:
                left = float(rect.attrib["x"])
                top = float(rect.attrib["y"])
                right = left + float(rect.attrib["width"])
                bottom = top + float(rect.attrib["height"])
                if use_polygons:
                    # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                    _boxes.append([[left, top], [right, top], [right, bottom], [left, bottom]])
                else:
                    # x_min, y_min, x_max, y_max
                    _boxes.append([left, top, right, bottom])
            boxes: np.ndarray = np.asarray(_boxes, dtype=np.float32)

            # Get the labels: text of every child element of every rectangle
            labels = [lab.text for rect in rectangles for lab in rect]

            if not recognition_task:
                self.data.append((name.text, dict(boxes=boxes, labels=labels)))
                continue

            crops = crop_bboxes_from_image(img_path=img_file, geoms=boxes)
            for crop, label in zip(crops, labels):
                if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
                    self.data.append((crop, label))

        self.root = tmp_root

    def extra_repr(self) -> str:
        return f"train={self.train}"

doctr/datasets/synthtext.py DELETED Viewed

@@ -1,128 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import glob
-import os
-from typing import Any, Dict, List, Tuple, Union
-import numpy as np
-from PIL import Image
-from scipy import io as sio
-from tqdm import tqdm
-from .datasets import VisionDataset
-from .utils import convert_target_to_relative, crop_bboxes_from_image
-__all__ = ["SynthText"]
class SynthText(VisionDataset):
    """SynthText dataset from `"Synthetic Data for Text Localisation in Natural Images"
    <https://arxiv.org/abs/1604.06646>`_ | `"repository" <https://github.com/ankush-me/SynthText>`_ |
    `"website" <https://www.robots.ox.ac.uk/~vgg/data/scenetext/>`_.

    .. image:: https://doctr-static.mindee.com/models?id=v0.5.0/svt-grid.png&src=0
        :align: center

    >>> from doctr.datasets import SynthText
    >>> train_set = SynthText(train=True, download=True)
    >>> img, target = train_set[0]

    Args:
    ----
        train: whether the subset should be the training one
        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
        recognition_task: whether the dataset should be used for recognition task
        **kwargs: keyword arguments from `VisionDataset`.
    """

    URL = "https://thor.robots.ox.ac.uk/~vgg/data/scenetext/SynthText.zip"
    SHA256 = "28ab030485ec8df3ed612c568dd71fb2793b9afbfa3a9d9c6e792aef33265bf1"

    def __init__(
        self,
        train: bool = True,
        use_polygons: bool = False,
        recognition_task: bool = False,
        **kwargs: Any,
    ) -> None:
        super().__init__(
            self.URL,
            None,
            file_hash=None,
            extract_archive=True,
            # Box targets are only converted to relative coordinates for detection samples
            pre_transforms=None if recognition_task else convert_target_to_relative,
            **kwargs,
        )
        self.train = train
        self.data: List[Tuple[Union[str, np.ndarray], Union[str, Dict[str, Any]]]] = []

        # Load mat data
        tmp_root = os.path.join(self.root, "SynthText") if self.SHA256 else self.root

        # define folder to write SynthText recognition dataset
        reco_folder_name = "SynthText_recognition_train" if self.train else "SynthText_recognition_test"
        if use_polygons:
            reco_folder_name = "Poly_" + reco_folder_name
        reco_folder_path = os.path.join(tmp_root, reco_folder_name)
        reco_images_counter = 0

        if recognition_task:
            if os.path.isdir(reco_folder_path):
                # Crops were already written by a previous run: reload them and stop here
                self._read_from_folder(reco_folder_path)
                return
            os.makedirs(reco_folder_path, exist_ok=False)

        mat_data = sio.loadmat(os.path.join(tmp_root, "gt.mat"))
        # The first 90% of the entries are the training subset, the rest the test subset
        split_index = int(len(mat_data["imnames"][0]) * 0.9)
        subset = slice(None, split_index) if self.train else slice(split_index, None)
        paths = mat_data["imnames"][0][subset]
        boxes = mat_data["wordBB"][0][subset]
        texts = mat_data["txt"][0][subset]
        del mat_data

        for img_path, word_boxes, txt in tqdm(
            iterable=zip(paths, boxes, texts), desc="Unpacking SynthText", total=len(paths)
        ):
            img_file = os.path.join(tmp_root, img_path[0])
            if not os.path.exists(img_file):
                raise FileNotFoundError(f"unable to locate {img_file}")

            # Every text entry may hold several whitespace-separated words
            words = [elt for word in txt.tolist() for elt in word.split()]

            # (x, y) coordinates of top left, top right, bottom right, bottom left corners
            if word_boxes.ndim == 3:
                word_boxes = word_boxes.transpose(2, 1, 0)
            else:
                # Only one box for this image: add the missing leading axis
                word_boxes = np.expand_dims(word_boxes.transpose(1, 0), axis=0)

            if not use_polygons:
                # xmin, ymin, xmax, ymax
                word_boxes = np.concatenate((word_boxes.min(axis=1), word_boxes.max(axis=1)), axis=1)

            if not recognition_task:
                self.data.append((img_path[0], dict(boxes=np.asarray(word_boxes, dtype=np.float32), labels=words)))
                continue

            crops = crop_bboxes_from_image(img_path=img_file, geoms=word_boxes)
            for crop, label in zip(crops, words):
                if crop.shape[0] > 0 and crop.shape[1] > 0 and len(label) > 0:
                    # write data to disk: <counter>.txt holds the label, <counter>.png the crop
                    sample_stem = os.path.join(reco_folder_path, str(reco_images_counter))
                    with open(sample_stem + ".txt", "w") as f:
                        f.write(label)
                        Image.fromarray(crop).save(sample_stem + ".png")
                        reco_images_counter += 1

        if recognition_task:
            self._read_from_folder(reco_folder_path)

        self.root = tmp_root

    def extra_repr(self) -> str:
        return f"train={self.train}"

    def _read_from_folder(self, path: str) -> None:
        """Append one (image path, label) sample per ``*.png`` file in `path`, reading the matching ``.txt``."""
        for img_path in glob.glob(os.path.join(path, "*.png")):
            stem, _ = os.path.splitext(os.path.basename(img_path))
            with open(os.path.join(path, f"{stem}.txt"), "r") as f:
                self.data.append((img_path, f.read()))

doctr/datasets/utils.py DELETED Viewed

@@ -1,216 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import string
-import unicodedata
-from collections.abc import Sequence
-from functools import partial
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union
-from typing import Sequence as SequenceType
-import numpy as np
-from PIL import Image
-from doctr.io.image import get_img_shape
-from doctr.utils.geometry import convert_to_relative_coords, extract_crops, extract_rcrops
-from .vocabs import VOCABS
-__all__ = ["translate", "encode_string", "decode_sequence", "encode_sequences", "pre_transform_multiclass"]
-ImageTensor = TypeVar("ImageTensor")
def translate(
    input_string: str,
    vocab_name: str,
    unknown_char: str = "■",
) -> str:
    """Translate a string input in a given vocabulary

    Characters already in the vocabulary are kept. Whitespace characters outside the vocabulary are
    dropped. Any other character is NFD-normalized and stripped to ASCII; if the result is empty or
    still not in the vocabulary, it is replaced by `unknown_char`.

    Args:
    ----
        input_string: input string to translate
        vocab_name: vocabulary to use (french, latin, ...)
        unknown_char: unknown character for non-translatable characters

    Returns:
    -------
        A string translated in a given vocab

    Raises:
    ------
        KeyError: if `vocab_name` is not a known vocabulary
    """
    if VOCABS.get(vocab_name) is None:
        raise KeyError("output vocabulary must be in vocabs dictionnary")
    vocab = VOCABS[vocab_name]

    pieces: List[str] = []
    for char in input_string:
        if char in vocab:
            pieces.append(char)
            continue
        if char in string.whitespace:
            # remove whitespaces that are not part of the vocab
            continue
        # normalize character if it is not in vocab
        ascii_char = unicodedata.normalize("NFD", char).encode("ascii", "ignore").decode("ascii")
        # the explicit empty check matters: "" is "in" every string
        if ascii_char == "" or ascii_char not in vocab:
            pieces.append(unknown_char)
        else:
            pieces.append(ascii_char)
    return "".join(pieces)
def encode_string(
    input_string: str,
    vocab: str,
) -> List[int]:
    """Given a predefined mapping, encode the string to a sequence of numbers

    Args:
    ----
        input_string: string to encode
        vocab: vocabulary (string), the encoding is given by the indexing of the character sequence

    Returns:
    -------
        A list encoding the input_string

    Raises:
    ------
        ValueError: if at least one character of `input_string` is not in `vocab`
    """
    try:
        return list(map(vocab.index, input_string))
    except ValueError as err:
        # Implicit string concatenation keeps source indentation out of the message
        # (a backslash continuation inside the literal would embed it).
        raise ValueError(
            "some characters cannot be found in 'vocab'. "
            f"Please check the input string {input_string} and the vocabulary {vocab}"
        ) from err
def decode_sequence(
    input_seq: Union[np.ndarray, SequenceType[int]],
    mapping: str,
) -> str:
    """Given a predefined mapping, decode the sequence of numbers to a string

    Args:
    ----
        input_seq: array to decode
        mapping: vocabulary (string), the encoding is given by the indexing of the character sequence

    Returns:
    -------
        A string, decoded from input_seq

    Raises:
    ------
        TypeError: if `input_seq` is neither a sequence nor a numpy array
        AssertionError: if `input_seq` is an array that is not of an integer dtype, or whose maximum
            is not smaller than the mapping size
    """
    if not isinstance(input_seq, (Sequence, np.ndarray)):
        raise TypeError("Invalid sequence type")
    if isinstance(input_seq, np.ndarray):
        # Accept every integer dtype (int32, int64, uint8, ...), not only the platform default one
        is_integer = np.issubdtype(input_seq.dtype, np.integer)
        # `.max()` is undefined on an empty array, which trivially satisfies the bound
        out_of_range = input_seq.size > 0 and input_seq.max() >= len(mapping)
        if not is_integer or out_of_range:
            raise AssertionError("Input must be an array of int, with max less than mapping size")
    return "".join(map(mapping.__getitem__, input_seq))
def encode_sequences(
    sequences: List[str],
    vocab: str,
    target_size: Optional[int] = None,
    eos: int = -1,
    sos: Optional[int] = None,
    pad: Optional[int] = None,
    dynamic_seq_length: bool = False,
) -> np.ndarray:
    """Encode character sequences using a given vocab as mapping

    Args:
    ----
        sequences: the list of character sequences of size N
        vocab: the ordered vocab to use for encoding
        target_size: maximum length of the encoded data
        eos: encoding of End Of String
        sos: optional encoding of Start Of String
        pad: optional encoding for padding. In case of padding, all sequences are followed by 1 EOS then PAD
        dynamic_seq_length: if `target_size` is specified, uses it as upper bound and enables dynamic sequence size

    Returns:
    -------
        the padded encoded data as a tensor (int32 array with one row per sequence)

    Raises:
    ------
        ValueError: if `eos`, `pad` or `sos` collides with a vocab index, or if a character of a
            sequence is not in `vocab`
    """
    vocab_size = len(vocab)
    use_pad = isinstance(pad, int)
    use_sos = isinstance(sos, int)

    # Validate every special symbol before doing any work
    if 0 <= eos < vocab_size:
        raise ValueError("argument 'eos' needs to be outside of vocab possible indices")
    if use_pad and 0 <= pad < vocab_size:  # type: ignore[operator]
        raise ValueError("argument 'pad' needs to be outside of vocab possible indices")
    if use_sos and 0 <= sos < vocab_size:  # type: ignore[operator]
        raise ValueError("argument 'sos' needs to be outside of vocab possible indices")

    if not isinstance(target_size, int) or dynamic_seq_length:
        # Maximum string length + EOS (an empty batch counts as length 0 instead of crashing)
        max_length = max((len(w) for w in sequences), default=0) + 1
        if use_sos:
            max_length += 1
        if use_pad:
            max_length += 1
        target_size = max_length if not isinstance(target_size, int) else min(max_length, target_size)

    # Pad with the padding symbol when given, otherwise with EOS
    default_symbol = pad if use_pad else eos
    encoded_data: np.ndarray = np.full([len(sequences), target_size], default_symbol, dtype=np.int32)

    # Encode the strings
    for idx, seq in enumerate(map(partial(encode_string, vocab=vocab), sequences)):
        if use_pad:
            # With a dedicated padding symbol, EOS is written explicitly after the word
            seq.append(eos)
        length = min(len(seq), target_size)
        encoded_data[idx, :length] = seq[:length]

    if use_sos:
        # Shift every row one step to the right, then put SOS in the freed first column
        encoded_data = np.roll(encoded_data, 1, axis=1)
        encoded_data[:, 0] = sos

    return encoded_data
def convert_target_to_relative(img: ImageTensor, target: Dict[str, Any]) -> Tuple[ImageTensor, Dict[str, Any]]:
    """Replace `target["boxes"]` in place with its relative-coordinate version for `img`.

    Args:
    ----
        img: image whose shape is passed to the conversion
        target: dictionary with a "boxes" entry; it is modified in place

    Returns:
    -------
        the unchanged image and the updated target dictionary
    """
    absolute_boxes = target["boxes"]
    img_shape = get_img_shape(img)
    target["boxes"] = convert_to_relative_coords(absolute_boxes, img_shape)
    return img, target
def crop_bboxes_from_image(img_path: Union[str, Path], geoms: np.ndarray) -> List[np.ndarray]:
    """Crop a set of bounding boxes from an image

    Args:
    ----
        img_path: path to the image
        geoms: a array of polygons of shape (N, 4, 2) or of straight boxes of shape (N, 4)

    Returns:
    -------
        a list of cropped images

    Raises:
    ------
        ValueError: if `geoms` has neither of the two supported shapes
    """
    # The image is loaded (as RGB) before the geometry is validated
    img: np.ndarray = np.array(Image.open(img_path).convert("RGB"))

    if geoms.ndim == 3 and geoms.shape[1:] == (4, 2):
        # Polygon
        extractor = extract_rcrops
    elif geoms.ndim == 2 and geoms.shape[1] == 4:
        # Straight boxes
        extractor = extract_crops
    else:
        raise ValueError("Invalid geometry format")

    return extractor(img, geoms.astype(dtype=int))
def pre_transform_multiclass(img, target: Tuple[np.ndarray, List]) -> Tuple[np.ndarray, Dict[str, List]]:
    """Converts multiclass target to relative coordinates.

    Args:
    ----
        img: Image
        target: tuple of target polygons and their classes names

    Returns:
    -------
        Image and dictionary of boxes, with class names as keys
    """
    polygons, class_names = target[0], target[1]
    relative_polygons = convert_to_relative_coords(polygons, get_img_shape(img))

    # One bucket per distinct class name, created in sorted order
    grouped: Dict = {}
    for class_name in sorted(set(class_names)):
        grouped[class_name] = []
    for class_name, polygon in zip(class_names, relative_polygons):
        grouped[class_name].append(polygon)

    # Stack each bucket into a single array
    stacked: Dict = {}
    for class_name, members in grouped.items():
        stacked[class_name] = np.stack(members, axis=0)
    return img, stacked

doctr/datasets/vocabs.py DELETED Viewed

@@ -1,71 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import string
-from typing import Dict
-__all__ = ["VOCABS"]
# Named character sets, each stored as a plain string.
# NOTE(review): character order looks significant - string-based encoders that use `vocab.index`
# would map each character to its position here - so avoid reordering; confirm against callers.

# Base building blocks used by the composite vocabularies below
VOCABS: Dict[str, str] = {
    "digits": string.digits,
    "ascii_letters": string.ascii_letters,
    "punctuation": string.punctuation,
    "currency": "£€¥¢฿",
    "ancient_greek": "αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ",
    "arabic_letters": "ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىي",
    "persian_letters": "پچڢڤگ",
    "hindi_digits": "٠١٢٣٤٥٦٧٨٩",
    "arabic_diacritics": "ًٌٍَُِّْ",
    "arabic_punctuation": "؟؛«»—",
}
# "latin" is the common ASCII base; "english" adds the degree sign and the currency symbols
VOCABS["latin"] = VOCABS["digits"] + VOCABS["ascii_letters"] + VOCABS["punctuation"]
VOCABS["english"] = VOCABS["latin"] + "°" + VOCABS["currency"]
# "legacy_french" is built on "latin" with its own accent list; "french" is built on "english"
VOCABS["legacy_french"] = VOCABS["latin"] + "°" + "àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ" + VOCABS["currency"]
VOCABS["french"] = VOCABS["english"] + "àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ"
# Every vocabulary below except "arabic" and "multilingual" is "english" plus language-specific characters
VOCABS["portuguese"] = VOCABS["english"] + "áàâãéêíïóôõúüçÁÀÂÃÉÊÍÏÓÔÕÚÜÇ"
VOCABS["spanish"] = VOCABS["english"] + "áéíóúüñÁÉÍÓÚÜÑ" + "¡¿"
VOCABS["italian"] = VOCABS["english"] + "àèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ"
VOCABS["german"] = VOCABS["english"] + "äöüßÄÖÜẞ"
# "arabic" is assembled from the base sets only (it does not include the ASCII letters)
VOCABS["arabic"] = (
    VOCABS["digits"]
    + VOCABS["hindi_digits"]
    + VOCABS["arabic_letters"]
    + VOCABS["persian_letters"]
    + VOCABS["arabic_diacritics"]
    + VOCABS["arabic_punctuation"]
    + VOCABS["punctuation"]
)
VOCABS["czech"] = VOCABS["english"] + "áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ"
VOCABS["polish"] = VOCABS["english"] + "ąćęłńóśźżĄĆĘŁŃÓŚŹŻ"
# NOTE(review): "dutch" uses the same accented letters as "spanish" (minus "¡¿") - confirm this is intended
VOCABS["dutch"] = VOCABS["english"] + "áéíóúüñÁÉÍÓÚÜÑ"
# "norwegian" and "danish" are identical strings
VOCABS["norwegian"] = VOCABS["english"] + "æøåÆØÅ"
VOCABS["danish"] = VOCABS["english"] + "æøåÆØÅ"
VOCABS["finnish"] = VOCABS["english"] + "äöÄÖ"
VOCABS["swedish"] = VOCABS["english"] + "åäöÅÄÖ"
# NOTE(review): the two literals below include plain "i" / "I", which "english" already contains,
# so those characters appear twice in this vocabulary - confirm before relying on its length
VOCABS["vietnamese"] = (
    VOCABS["english"]
    + "áàảạãăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựiíìỉĩịýỳỷỹỵ"
    + "ÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰIÍÌỈĨỊÝỲỶỸỴ"
)
VOCABS["hebrew"] = VOCABS["english"] + "אבגדהוזחטיכלמנסעפצקרשת" + "₪"
# Union of the listed vocabularies plus "§"; `dict.fromkeys` drops duplicate characters while
# keeping the order of first occurrence
VOCABS["multilingual"] = "".join(
    dict.fromkeys(
        VOCABS["french"]
        + VOCABS["portuguese"]
        + VOCABS["spanish"]
        + VOCABS["german"]
        + VOCABS["czech"]
        + VOCABS["polish"]
        + VOCABS["dutch"]
        + VOCABS["italian"]
        + VOCABS["norwegian"]
        + VOCABS["danish"]
        + VOCABS["finnish"]
        + VOCABS["swedish"]
        + "§"
    )
)

doctr/datasets/wildreceipt.py DELETED Viewed

@@ -1,111 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-import json
-import os
-from pathlib import Path
-from typing import Any, Dict, List, Tuple, Union
-import numpy as np
-from .datasets import AbstractDataset
-from .utils import convert_target_to_relative, crop_bboxes_from_image
-__all__ = ["WILDRECEIPT"]
class WILDRECEIPT(AbstractDataset):
    """WildReceipt dataset from `"Spatial Dual-Modality Graph Reasoning for Key Information Extraction"
    <https://arxiv.org/abs/2103.14470v1>`_ |
    `repository <https://download.openmmlab.com/mmocr/data/wildreceipt.tar>`_.

    .. image:: https://doctr-static.mindee.com/models?id=v0.7.0/wildreceipt-dataset.jpg&src=0
        :align: center

    >>> # NOTE: You need to download the dataset first.
    >>> from doctr.datasets import WILDRECEIPT
    >>> train_set = WILDRECEIPT(train=True, img_folder="/path/to/wildreceipt/",
    >>>                     label_path="/path/to/wildreceipt/train.txt")
    >>> img, target = train_set[0]
    >>> test_set = WILDRECEIPT(train=False, img_folder="/path/to/wildreceipt/",
    >>>                    label_path="/path/to/wildreceipt/test.txt")
    >>> img, target = test_set[0]

    Args:
    ----
        img_folder: folder with all the images of the dataset
        label_path: path to the annotations file of the dataset
        train: whether the subset should be the training one
        use_polygons: whether polygons should be considered as rotated bounding box (instead of straight ones)
        recognition_task: whether the dataset should be used for recognition task
        **kwargs: keyword arguments from `AbstractDataset`.

    Raises:
    ------
        FileNotFoundError: if `label_path` or `img_folder` does not exist
    """

    def __init__(
        self,
        img_folder: str,
        label_path: str,
        train: bool = True,
        use_polygons: bool = False,
        recognition_task: bool = False,
        **kwargs: Any,
    ) -> None:
        super().__init__(
            img_folder, pre_transforms=convert_target_to_relative if not recognition_task else None, **kwargs
        )
        # File existence check
        if not os.path.exists(label_path) or not os.path.exists(img_folder):
            raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")

        tmp_root = img_folder
        self.train = train
        np_dtype = np.float32
        self.data: List[Tuple[Union[str, Path, np.ndarray], Union[str, Dict[str, Any]]]] = []

        with open(label_path, "r") as file:
            data = file.read()

        # Split the text file into separate JSON strings (one JSON document per line)
        json_strings = data.strip().split("\n")
        box: Union[List[float], np.ndarray]
        for json_string in json_strings:
            if not json_string.strip():
                # Tolerate blank lines (including a completely empty label file)
                continue
            json_data = json.loads(json_string)
            img_path = json_data["file_name"]
            annotations = json_data["annotations"]

            # Targets must be collected per image: sharing one list across iterations would attach
            # the boxes and labels of all previous images to the current one.
            _targets = []
            for annotation in annotations:
                coordinates = annotation["box"]
                if use_polygons:
                    # (x, y) coordinates of top left, top right, bottom right, bottom left corners
                    box = np.array(
                        [
                            [coordinates[0], coordinates[1]],
                            [coordinates[2], coordinates[3]],
                            [coordinates[4], coordinates[5]],
                            [coordinates[6], coordinates[7]],
                        ],
                        dtype=np_dtype,
                    )
                else:
                    # Straight box enclosing the 4 annotated corners: xmin, ymin, xmax, ymax
                    x, y = coordinates[::2], coordinates[1::2]
                    box = [min(x), min(y), max(x), max(y)]
                _targets.append((annotation["text"], box))

            if not _targets:
                # Nothing annotated on this image: there is no target to build
                continue
            text_targets, box_targets = zip(*_targets)
            # Negative coordinates are clipped to 0
            boxes = np.asarray(box_targets, dtype=int).clip(min=0)

            if recognition_task:
                crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_path), geoms=boxes)
                for crop, label in zip(crops, list(text_targets)):
                    # Only keep non-empty, single-word labels
                    if label and " " not in label:
                        self.data.append((crop, label))
            else:
                self.data.append((
                    img_path,
                    dict(boxes=boxes, labels=list(text_targets)),
                ))
        self.root = tmp_root

    def extra_repr(self) -> str:
        return f"train={self.train}"

doctr/file_utils.py DELETED Viewed

@@ -1,92 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-# Adapted from https://github.com/huggingface/transformers/blob/master/src/transformers/file_utils.py
-import importlib.util
-import logging
-import os
-import sys
-# Module-level backend detection: decides at import time whether PyTorch and/or TensorFlow may be used,
-# based on the USE_TORCH / USE_TF environment variables and on what is actually installed.
-CLASS_NAME: str = "words"
-# Before Python 3.8 the metadata API is taken from the `importlib_metadata` package instead of `importlib.metadata`
-if sys.version_info < (3, 8):  # pragma: no cover
-    import importlib_metadata
-else:
-    import importlib.metadata as importlib_metadata
-__all__ = ["is_tf_available", "is_torch_available", "CLASS_NAME"]
-# Environment values (compared after upper-casing) that explicitly enable a backend; "AUTO" is the default
-ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
-ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
-USE_TF = os.environ.get("USE_TF", "AUTO").upper()
-USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
-# PyTorch is probed only when USE_TORCH is enabled/AUTO and USE_TF has not been explicitly enabled
-if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
-    _torch_available = importlib.util.find_spec("torch") is not None
-    if _torch_available:
-        try:
-            _torch_version = importlib_metadata.version("torch")
-            logging.info(f"PyTorch version {_torch_version} available.")
-        except importlib_metadata.PackageNotFoundError:  # pragma: no cover
-            # An importable module without distribution metadata is treated as not installed
-            _torch_available = False
-else:  # pragma: no cover
-    # NOTE: this branch is also reached when USE_TORCH itself holds a non-true value, not only when USE_TF is set
-    logging.info("Disabling PyTorch because USE_TF is set")
-    _torch_available = False
-# Mirror of the check above for TensorFlow; with both variables left on AUTO, both backends can be available
-if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES:
-    _tf_available = importlib.util.find_spec("tensorflow") is not None
-    if _tf_available:
-        # Distribution names under which the `tensorflow` module may have been installed
-        candidates = (
-            "tensorflow",
-            "tensorflow-cpu",
-            "tensorflow-gpu",
-            "tf-nightly",
-            "tf-nightly-cpu",
-            "tf-nightly-gpu",
-            "intel-tensorflow",
-            "tensorflow-rocm",
-            "tensorflow-macos",
-        )
-        _tf_version = None
-        # For the metadata, we have to look for both tensorflow and tensorflow-cpu
-        for pkg in candidates:
-            try:
-                _tf_version = importlib_metadata.version(pkg)
-                break
-            except importlib_metadata.PackageNotFoundError:
-                pass
-        # No matching distribution metadata: treated as not installed
-        _tf_available = _tf_version is not None
-    if _tf_available:
-        # Only the major version is inspected; anything below 2 is rejected
-        if int(_tf_version.split(".")[0]) < 2:  # type: ignore[union-attr]  # pragma: no cover
-            logging.info(f"TensorFlow found but with version {_tf_version}. DocTR requires version 2 minimum.")
-            _tf_available = False
-        else:
-            logging.info(f"TensorFlow version {_tf_version} available.")
-else:  # pragma: no cover
-    # NOTE: this branch is also reached when USE_TF itself holds a non-true value, not only when USE_TORCH is set
-    logging.info("Disabling Tensorflow because USE_TORCH is set")
-    _tf_available = False
-# Importing this module fails outright when no usable backend was found
-if not _torch_available and not _tf_available:  # pragma: no cover
-    raise ModuleNotFoundError(
-        "DocTR requires either TensorFlow or PyTorch to be installed. Please ensure one of them"
-        " is installed and that either USE_TF or USE_TORCH is enabled."
-    )
-def is_torch_available():
-    """Whether PyTorch was found and kept enabled when this module was imported.
-    Returns the module-level ``_torch_available`` flag, which is False when PyTorch is missing or was
-    disabled through the USE_TORCH / USE_TF environment variables.
-    """
-    return _torch_available
-def is_tf_available():
-    """Whether TensorFlow (major version 2 or above) was found and kept enabled when this module was imported.
-    Returns the module-level ``_tf_available`` flag, which is False when TensorFlow is missing, too old, or was
-    disabled through the USE_TF / USE_TORCH environment variables.
-    """
-    return _tf_available

doctr/io/__init__.py DELETED Viewed

@@ -1,5 +0,0 @@
-from .elements import *
-from .html import *
-from .image import *
-from .pdf import *
-from .reader import *

doctr/io/elements.py DELETED Viewed

@@ -1,621 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-from typing import Any, Dict, List, Optional, Tuple, Union
-from defusedxml import defuse_stdlib
-defuse_stdlib()
-from xml.etree import ElementTree as ET
-from xml.etree.ElementTree import Element as ETElement
-from xml.etree.ElementTree import SubElement
-import matplotlib.pyplot as plt
-import numpy as np
-import doctr
-from doctr.utils.common_types import BoundingBox
-from doctr.utils.geometry import resolve_enclosing_bbox, resolve_enclosing_rbbox
-from doctr.utils.repr import NestedObject
-from doctr.utils.visualization import synthesize_kie_page, synthesize_page, visualize_kie_page, visualize_page
-__all__ = ["Element", "Word", "Artefact", "Line", "Prediction", "Block", "Page", "KIEPage", "Document"]
-class Element(NestedObject):
-    """Abstract base of document elements that can be exported to a nested dict and rendered as text"""
-    _children_names: List[str] = []
-    _exported_keys: List[str] = []
-    def __init__(self, **kwargs: Any) -> None:
-        # Only names declared in `_children_names` may be set through the constructor
-        for name, value in kwargs.items():
-            if name not in self._children_names:
-                raise KeyError(f"{self.__class__.__name__} object does not have any attribute named '{name}'")
-            setattr(self, name, value)
-    def export(self) -> Dict[str, Any]:
-        """Export the element, then each of its children recursively, into a nested dict"""
-        exported: Dict[str, Any] = {}
-        for key in self._exported_keys:
-            exported[key] = getattr(self, key)
-        for child_name in self._children_names:
-            if child_name == "predictions":
-                # "predictions" is a mapping of lists, every other child collection is a plain iterable
-                exported[child_name] = {
-                    label: [item.export() for item in items] for label, items in getattr(self, child_name).items()
-                }
-            else:
-                exported[child_name] = [child.export() for child in getattr(self, child_name)]
-        return exported
-    @classmethod
-    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
-        """Rebuild an element from its exported dict; must be provided by subclasses"""
-        raise NotImplementedError
-    def render(self) -> str:
-        """Return the text of the element; must be provided by subclasses"""
-        raise NotImplementedError
-class Word(Element):
-    """Single recognized word of a document
-    Args:
-    ----
-        value: text content of the word
-        confidence: confidence associated with the text prediction
-        geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)), or a numpy array, where
-            coordinates are relative to the page's size
-        crop_orientation: general orientation of the crop in degrees and its confidence
-    """
-    _exported_keys: List[str] = ["value", "confidence", "geometry", "crop_orientation"]
-    _children_names: List[str] = []
-    def __init__(
-        self,
-        value: str,
-        confidence: float,
-        geometry: Union[BoundingBox, np.ndarray],
-        crop_orientation: Dict[str, Any],
-    ) -> None:
-        super().__init__()
-        self.value, self.confidence = value, confidence
-        self.geometry, self.crop_orientation = geometry, crop_orientation
-    def render(self) -> str:
-        """Return the text of the word"""
-        return self.value
-    def extra_repr(self) -> str:
-        text, score = self.value, self.confidence
-        return f"value='{text}', confidence={score:.2}"
-    @classmethod
-    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
-        """Build a word from an exported dict; only the keys listed in ``_exported_keys`` are read"""
-        return cls(**{key: save_dict[key] for key in cls._exported_keys})
-class Artefact(Element):
-    """Implements a non-textual element
-    Args:
-    ----
-        artefact_type: the type of artefact
-        confidence: the confidence of the type prediction
-        geometry: bounding box of the artefact in format ((xmin, ymin), (xmax, ymax)) where coordinates are
-            relative to the page's size.
-    """
-    _exported_keys: List[str] = ["geometry", "type", "confidence"]
-    _children_names: List[str] = []
-    def __init__(self, artefact_type: str, confidence: float, geometry: BoundingBox) -> None:
-        super().__init__()
-        self.geometry = geometry
-        # Stored (and exported) under the name `type`, unlike the constructor parameter
-        self.type = artefact_type
-        self.confidence = confidence
-    def render(self) -> str:
-        """Renders the element as its upper-cased type between square brackets"""
-        return f"[{self.type.upper()}]"
-    def extra_repr(self) -> str:
-        return f"type='{self.type}', confidence={self.confidence:.2}"
-    @classmethod
-    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
-        """Build an artefact from a dict produced by `export`
-        Args:
-        ----
-            save_dict: dict holding the keys listed in ``_exported_keys``
-            **kwargs: unused
-        Returns:
-        -------
-            the rebuilt artefact
-        """
-        init_kwargs = {k: save_dict[k] for k in cls._exported_keys}
-        # The exported key is "type" but the constructor parameter is `artefact_type`:
-        # passing it through unchanged raised a TypeError on every call
-        init_kwargs["artefact_type"] = init_kwargs.pop("type")
-        return cls(**init_kwargs)
-class Line(Element):
-    """Line of text, stored as an ordered collection of words
-    Args:
-    ----
-        words: list of word elements
-        geometry: bounding box of the line in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative
-            to the page's size. When omitted, it is resolved to the box enclosing all the words of the line.
-    """
-    _exported_keys: List[str] = ["geometry"]
-    _children_names: List[str] = ["words"]
-    words: List[Word] = []
-    def __init__(
-        self,
-        words: List[Word],
-        geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
-    ) -> None:
-        if geometry is None:
-            # A first-word geometry of length 4 is handled as a rotated box, anything else as a straight one
-            if len(words[0].geometry) == 4:
-                enclose = resolve_enclosing_rbbox
-            else:
-                enclose = resolve_enclosing_bbox
-            geometry = enclose([word.geometry for word in words])  # type: ignore[operator]
-        super().__init__(words=words)
-        self.geometry = geometry
-    def render(self) -> str:
-        """Return the words of the line joined by single spaces"""
-        rendered_words = (word.render() for word in self.words)
-        return " ".join(rendered_words)
-    @classmethod
-    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
-        """Build a line, and the words it contains, from an exported dict"""
-        init_args = {key: save_dict[key] for key in cls._exported_keys}
-        init_args["words"] = [Word.from_dict(word_dict) for word_dict in save_dict["words"]]
-        return cls(**init_args)
-class Prediction(Word):
-    """Prediction element: a word whose repr additionally reports its bounding box"""
-    def render(self) -> str:
-        """Return the predicted text"""
-        return self.value
-    def extra_repr(self) -> str:
-        text, score, box = self.value, self.confidence, self.geometry
-        return f"value='{text}', confidence={score:.2}, bounding_box={box}"
-class Block(Element):
-    """Implements a block element as a collection of lines and artefacts
-    Args:
-    ----
-        lines: list of line elements (defaults to an empty list)
-        artefacts: list of artefacts (defaults to an empty list)
-        geometry: bounding box of the block in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative
-            to the page's size. If not specified, it will be resolved by default to the smallest bounding box
-            enclosing all words and artefacts in it.
-    """
-    _exported_keys: List[str] = ["geometry"]
-    _children_names: List[str] = ["lines", "artefacts"]
-    lines: List[Line] = []
-    artefacts: List[Artefact] = []
-    def __init__(
-        self,
-        lines: Optional[List[Line]] = None,
-        artefacts: Optional[List[Artefact]] = None,
-        geometry: Optional[Union[BoundingBox, np.ndarray]] = None,
-    ) -> None:
-        # A fresh list per instance: a literal `[]` default would be shared by every block relying on it,
-        # because the default object itself ends up stored as `self.lines` / `self.artefacts`
-        lines = [] if lines is None else lines
-        artefacts = [] if artefacts is None else artefacts
-        # Resolve the geometry using the smallest enclosing bounding box
-        if geometry is None:
-            line_boxes = [word.geometry for line in lines for word in line.words]
-            artefact_boxes = [artefact.geometry for artefact in artefacts]
-            # Rotated vs straight is decided from the first line, or from the first artefact when the block
-            # has no line (this used to raise IndexError); a block with neither still raises IndexError
-            reference = lines[0].geometry if lines else artefacts[0].geometry
-            box_resolution_fn = resolve_enclosing_rbbox if isinstance(reference, np.ndarray) else resolve_enclosing_bbox
-            geometry = box_resolution_fn(line_boxes + artefact_boxes)  # type: ignore[operator]
-        super().__init__(lines=lines, artefacts=artefacts)
-        self.geometry = geometry
-    def render(self, line_break: str = "\n") -> str:
-        """Renders the full text of the element, one line per `line_break`; artefacts are not rendered"""
-        return line_break.join(line.render() for line in self.lines)
-    @classmethod
-    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
-        """Build a block, with its lines and artefacts, from a dict produced by `export`"""
-        kwargs = {k: save_dict[k] for k in cls._exported_keys}
-        kwargs.update({
-            "lines": [Line.from_dict(_dict) for _dict in save_dict["lines"]],
-            "artefacts": [Artefact.from_dict(_dict) for _dict in save_dict["artefacts"]],
-        })
-        return cls(**kwargs)
-class Page(Element):
-    """Implements a page element as a collection of blocks
-    Args:
-    ----
-        page: image encoded as a numpy array in uint8
-        blocks: list of block elements
-        page_idx: the index of the page in the input raw document
-        dimensions: the page size in pixels in format (height, width)
-        orientation: a dictionary with the value of the rotation angle in degrees and confidence of the prediction
-        language: a dictionary with the language value and confidence of the prediction
-    """
-    _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"]
-    _children_names: List[str] = ["blocks"]
-    blocks: List[Block] = []
-    def __init__(
-        self,
-        page: np.ndarray,
-        blocks: List[Block],
-        page_idx: int,
-        dimensions: Tuple[int, int],
-        orientation: Optional[Dict[str, Any]] = None,
-        language: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(blocks=blocks)
-        self.page = page
-        self.page_idx = page_idx
-        self.dimensions = dimensions
-        # Anything that is not a dict is replaced by an empty value/confidence record
-        self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None)
-        self.language = language if isinstance(language, dict) else dict(value=None, confidence=None)
-    def render(self, block_break: str = "\n\n") -> str:
-        """Renders the full text of the element"""
-        return block_break.join(b.render() for b in self.blocks)
-    def extra_repr(self) -> str:
-        return f"dimensions={self.dimensions}"
-    def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
-        """Overlay the result on a given image
-        Args:
-            interactive: whether the display should be interactive
-            preserve_aspect_ratio: pass True if you passed True to the predictor
-            **kwargs: additional keyword arguments passed to the matplotlib.pyplot.show method
-        """
-        visualize_page(self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio)
-        plt.show(**kwargs)
-    def synthesize(self, **kwargs) -> np.ndarray:
-        """Synthesize the page from the predictions
-        Args:
-        ----
-            **kwargs: keyword arguments passed to `synthesize_page`
-        Returns:
-        -------
-            synthesized page
-        """
-        return synthesize_page(self.export(), **kwargs)
-    @staticmethod
-    def _hocr_bbox(geometry: Any, width: int, height: int) -> str:
-        """Format a relative straight box as an hOCR ``bbox`` property expressed in absolute pixels"""
-        (xmin, ymin), (xmax, ymax) = geometry
-        return (
-            f"bbox {int(round(xmin * width))} {int(round(ymin * height))} "
-            f"{int(round(xmax * width))} {int(round(ymax * height))}"
-        )
-    def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> Tuple[bytes, ET.ElementTree]:
-        """Export the page as XML (hOCR-format)
-        convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md
-        Args:
-        ----
-            file_title: the title of the XML file
-        Returns:
-        -------
-            a tuple of the XML byte string, and its ElementTree
-        Raises:
-        ------
-            TypeError: if a block geometry is not a straight bounding box
-        """
-        p_idx = self.page_idx
-        block_count: int = 1
-        line_count: int = 1
-        word_count: int = 1
-        height, width = self.dimensions
-        # `self.language` is a value/confidence dict: use the detected value, and fall back to English when it is
-        # missing. The former test looked for a "language" key, which that dict does not have.
-        language = self.language.get("value") or "en"
-        # Create the XML root element
-        page_hocr = ETElement("html", attrib={"xmlns": "http://www.w3.org/1999/xhtml", "xml:lang": str(language)})
-        # Create the header / SubElements of the root element
-        head = SubElement(page_hocr, "head")
-        SubElement(head, "title").text = file_title
-        SubElement(head, "meta", attrib={"http-equiv": "Content-Type", "content": "text/html; charset=utf-8"})
-        SubElement(
-            head,
-            "meta",
-            attrib={"name": "ocr-system", "content": f"python-doctr {doctr.__version__}"},  # type: ignore[attr-defined]
-        )
-        SubElement(
-            head,
-            "meta",
-            attrib={"name": "ocr-capabilities", "content": "ocr_page ocr_carea ocr_par ocr_line ocrx_word"},
-        )
-        # Create the body
-        body = SubElement(page_hocr, "body")
-        SubElement(
-            body,
-            "div",
-            attrib={
-                "class": "ocr_page",
-                "id": f"page_{p_idx + 1}",
-                "title": f"image; bbox 0 0 {width} {height}; ppageno 0",
-            },
-        )
-        # Iterate over the blocks / lines / words and create the XML elements in body line by line.
-        # Titles are assembled from `_hocr_bbox` so that no source indentation leaks into the attribute values
-        # (the former backslash continuations inside the f-strings embedded runs of spaces).
-        for block in self.blocks:
-            if len(block.geometry) != 2:
-                raise TypeError("XML export is only available for straight bounding boxes for now.")
-            block_bbox = self._hocr_bbox(block.geometry, width, height)
-            block_div = SubElement(
-                body,
-                "div",
-                attrib={"class": "ocr_carea", "id": f"block_{block_count}", "title": block_bbox},
-            )
-            # Each block holds exactly one paragraph, which shares the block's box
-            paragraph = SubElement(
-                block_div,
-                "p",
-                attrib={"class": "ocr_par", "id": f"par_{block_count}", "title": block_bbox},
-            )
-            block_count += 1
-            for line in block.lines:
-                line_bbox = self._hocr_bbox(line.geometry, width, height)
-                # NOTE: baseline, x_size, x_descenders, x_ascenders is currently initialized to 0
-                line_span = SubElement(
-                    paragraph,
-                    "span",
-                    attrib={
-                        "class": "ocr_line",
-                        "id": f"line_{line_count}",
-                        "title": f"{line_bbox}; baseline 0 0; x_size 0; x_descenders 0; x_ascenders 0",
-                    },
-                )
-                line_count += 1
-                for word in line.words:
-                    word_bbox = self._hocr_bbox(word.geometry, width, height)
-                    conf = word.confidence
-                    word_div = SubElement(
-                        line_span,
-                        "span",
-                        attrib={
-                            "class": "ocrx_word",
-                            "id": f"word_{word_count}",
-                            "title": f"{word_bbox}; x_wconf {int(round(conf * 100))}",
-                        },
-                    )
-                    # set the text
-                    word_div.text = word.value
-                    word_count += 1
-        return (ET.tostring(page_hocr, encoding="utf-8", method="xml"), ET.ElementTree(page_hocr))
-    @classmethod
-    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
-        """Build a page, with its blocks, from a dict produced by `export`
-        Args:
-        ----
-            save_dict: exported page dict
-            **kwargs: may carry `page`, the page image, which `export` does not include
-        Returns:
-        -------
-            the rebuilt page
-        """
-        init_kwargs = {k: save_dict[k] for k in cls._exported_keys}
-        init_kwargs["blocks"] = [Block.from_dict(block_dict) for block_dict in save_dict["blocks"]]
-        # `page` is required by the constructor but is not part of the export: take it from the caller, or from
-        # the dict when present. Without it the constructor raises TypeError, as it previously always did.
-        if "page" in kwargs:
-            init_kwargs["page"] = kwargs["page"]
-        elif "page" in save_dict:
-            init_kwargs["page"] = save_dict["page"]
-        return cls(**init_kwargs)
-class KIEPage(Element):
-    """Implements a KIE page element as a collection of predictions
-    Args:
-    ----
-        predictions: Dictionary with list of prediction elements for each detection class
-        page: image encoded as a numpy array in uint8
-        page_idx: the index of the page in the input raw document
-        dimensions: the page size in pixels in format (height, width)
-        orientation: a dictionary with the value of the rotation angle in degrees and confidence of the prediction
-        language: a dictionary with the language value and confidence of the prediction
-    """
-    _exported_keys: List[str] = ["page_idx", "dimensions", "orientation", "language"]
-    _children_names: List[str] = ["predictions"]
-    predictions: Dict[str, List[Prediction]] = {}
-    def __init__(
-        self,
-        page: np.ndarray,
-        predictions: Dict[str, List[Prediction]],
-        page_idx: int,
-        dimensions: Tuple[int, int],
-        orientation: Optional[Dict[str, Any]] = None,
-        language: Optional[Dict[str, Any]] = None,
-    ) -> None:
-        super().__init__(predictions=predictions)
-        self.page = page
-        self.page_idx = page_idx
-        self.dimensions = dimensions
-        # Anything that is not a dict is replaced by an empty value/confidence record
-        self.orientation = orientation if isinstance(orientation, dict) else dict(value=None, confidence=None)
-        self.language = language if isinstance(language, dict) else dict(value=None, confidence=None)
-    def render(self, prediction_break: str = "\n\n") -> str:
-        """Renders the full text of the element, one "<class name>: <text>" entry per prediction"""
-        return prediction_break.join(
-            f"{class_name}: {p.render()}" for class_name, predictions in self.predictions.items() for p in predictions
-        )
-    def extra_repr(self) -> str:
-        return f"dimensions={self.dimensions}"
-    def show(self, interactive: bool = True, preserve_aspect_ratio: bool = False, **kwargs) -> None:
-        """Overlay the result on a given image
-        Args:
-            interactive: whether the display should be interactive
-            preserve_aspect_ratio: pass True if you passed True to the predictor
-            **kwargs: keyword arguments passed to the matplotlib.pyplot.show method
-        """
-        visualize_kie_page(
-            self.export(), self.page, interactive=interactive, preserve_aspect_ratio=preserve_aspect_ratio
-        )
-        plt.show(**kwargs)
-    def synthesize(self, **kwargs) -> np.ndarray:
-        """Synthesize the page from the predictions
-        Args:
-        ----
-            **kwargs: keyword arguments passed to `synthesize_kie_page`
-        Returns:
-        -------
-            synthesized page
-        """
-        return synthesize_kie_page(self.export(), **kwargs)
-    @staticmethod
-    def _hocr_bbox(geometry: Any, width: int, height: int) -> str:
-        """Format a relative straight box as an hOCR ``bbox`` property expressed in absolute pixels"""
-        (xmin, ymin), (xmax, ymax) = geometry
-        return (
-            f"bbox {int(round(xmin * width))} {int(round(ymin * height))} "
-            f"{int(round(xmax * width))} {int(round(ymax * height))}"
-        )
-    def export_as_xml(self, file_title: str = "docTR - XML export (hOCR)") -> Tuple[bytes, ET.ElementTree]:
-        """Export the page as XML (hOCR-format)
-        convention: https://github.com/kba/hocr-spec/blob/master/1.2/spec.md
-        Args:
-        ----
-            file_title: the title of the XML file
-        Returns:
-        -------
-            a tuple of the XML byte string, and its ElementTree
-        Raises:
-        ------
-            TypeError: if a prediction geometry is not a straight bounding box
-        """
-        p_idx = self.page_idx
-        prediction_count: int = 1
-        height, width = self.dimensions
-        # `self.language` is a value/confidence dict: use the detected value, and fall back to English when it is
-        # missing. The former test looked for a "language" key, which that dict does not have.
-        language = self.language.get("value") or "en"
-        # Create the XML root element
-        page_hocr = ETElement("html", attrib={"xmlns": "http://www.w3.org/1999/xhtml", "xml:lang": str(language)})
-        # Create the header / SubElements of the root element
-        head = SubElement(page_hocr, "head")
-        SubElement(head, "title").text = file_title
-        SubElement(head, "meta", attrib={"http-equiv": "Content-Type", "content": "text/html; charset=utf-8"})
-        SubElement(
-            head,
-            "meta",
-            attrib={"name": "ocr-system", "content": f"python-doctr {doctr.__version__}"},  # type: ignore[attr-defined]
-        )
-        SubElement(
-            head,
-            "meta",
-            attrib={"name": "ocr-capabilities", "content": "ocr_page ocr_carea ocr_par ocr_line ocrx_word"},
-        )
-        # Create the body
-        body = SubElement(page_hocr, "body")
-        SubElement(
-            body,
-            "div",
-            attrib={
-                "class": "ocr_page",
-                "id": f"page_{p_idx + 1}",
-                "title": f"image; bbox 0 0 {width} {height}; ppageno 0",
-            },
-        )
-        # Iterate over the predictions of every class and create one XML element per prediction in body.
-        # The title comes from `_hocr_bbox` so that no source indentation leaks into the attribute value
-        # (the former backslash continuation inside the f-string embedded a run of spaces).
-        for class_name, predictions in self.predictions.items():
-            for prediction in predictions:
-                if len(prediction.geometry) != 2:
-                    raise TypeError("XML export is only available for straight bounding boxes for now.")
-                prediction_div = SubElement(
-                    body,
-                    "div",
-                    attrib={
-                        "class": "ocr_carea",
-                        "id": f"{class_name}_prediction_{prediction_count}",
-                        "title": self._hocr_bbox(prediction.geometry, width, height),
-                    },
-                )
-                prediction_div.text = prediction.value
-                prediction_count += 1
-        return ET.tostring(page_hocr, encoding="utf-8", method="xml"), ET.ElementTree(page_hocr)
-    @classmethod
-    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
-        """Build a KIE page, with its predictions, from a dict produced by `export`
-        Args:
-        ----
-            save_dict: exported page dict
-            **kwargs: may carry `page`, the page image, which `export` does not include
-        Returns:
-        -------
-            the rebuilt page
-        """
-        init_kwargs = {k: save_dict[k] for k in cls._exported_keys}
-        raw_predictions = save_dict["predictions"]
-        if isinstance(raw_predictions, dict):
-            # Shape produced by `export`: one list of prediction dicts per class name. Iterating that dict
-            # directly, as was done before, handed the class names to `Prediction.from_dict`.
-            init_kwargs["predictions"] = {
-                class_name: [Prediction.from_dict(prediction_dict) for prediction_dict in prediction_dicts]
-                for class_name, prediction_dicts in raw_predictions.items()
-            }
-        else:
-            # Flat sequence of prediction dicts: handled exactly as before
-            init_kwargs["predictions"] = [Prediction.from_dict(prediction_dict) for prediction_dict in raw_predictions]
-        # `page` is required by the constructor but is not part of the export: take it from the caller, or from
-        # the dict when present. Without it the constructor raises TypeError, as it previously always did.
-        if "page" in kwargs:
-            init_kwargs["page"] = kwargs["page"]
-        elif "page" in save_dict:
-            init_kwargs["page"] = save_dict["page"]
-        return cls(**init_kwargs)
-class Document(Element):
-    """Implements a document element as a collection of pages
-    Args:
-    ----
-        pages: list of page elements
-    """
-    _children_names: List[str] = ["pages"]
-    pages: List[Page] = []
-    def __init__(
-        self,
-        pages: List[Page],
-    ) -> None:
-        super().__init__(pages=pages)
-    def render(self, page_break: str = "\n\n\n\n") -> str:
-        """Renders the full text of the element"""
-        return page_break.join(p.render() for p in self.pages)
-    def show(self, **kwargs) -> None:
-        """Overlay the result on a given image
-        Args:
-        ----
-            **kwargs: keyword arguments passed to the `show` method of every page
-        """
-        for result in self.pages:
-            result.show(**kwargs)
-    def synthesize(self, **kwargs) -> List[np.ndarray]:
-        """Synthesize all pages from their predictions
-        Args:
-        ----
-            **kwargs: keyword arguments passed to the `synthesize` method of every page
-        Returns:
-        -------
-            list of synthesized pages
-        """
-        # Forward the options: they were previously accepted here but silently dropped
-        return [page.synthesize(**kwargs) for page in self.pages]
-    def export_as_xml(self, **kwargs) -> List[Tuple[bytes, ET.ElementTree]]:
-        """Export the document as XML (hOCR-format)
-        Args:
-        ----
-            **kwargs: additional keyword arguments passed to the Page.export_as_xml method
-        Returns:
-        -------
-            list of tuple of (bytes, ElementTree)
-        """
-        return [page.export_as_xml(**kwargs) for page in self.pages]
-    @classmethod
-    def from_dict(cls, save_dict: Dict[str, Any], **kwargs):
-        """Build a document, with its pages, from a dict produced by `export`"""
-        kwargs = {k: save_dict[k] for k in cls._exported_keys}
-        kwargs.update({"pages": [Page.from_dict(page_dict) for page_dict in save_dict["pages"]]})
-        return cls(**kwargs)
-class KIEDocument(Document):
-    """Implements a KIE document element as a collection of KIE pages
-    Behaves like `Document`; only the declared type of `pages` differs.
-    Args:
-    ----
-        pages: list of KIE page elements
-    """
-    _children_names: List[str] = ["pages"]
-    pages: List[KIEPage] = []  # type: ignore[assignment]
-    def __init__(
-        self,
-        pages: List[KIEPage],
-    ) -> None:
-        # Storage of the pages is delegated to the parent constructor
-        super().__init__(pages=pages)  # type: ignore[arg-type]

doctr/io/html.py DELETED Viewed

@@ -1,28 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-from typing import Any
-from weasyprint import HTML
-__all__ = ["read_html"]
-def read_html(url: str, **kwargs: Any) -> bytes:
-    """Read a web page and convert it into a PDF byte stream
-    >>> from doctr.io import read_html
-    >>> doc = read_html("https://www.yoursite.com")
-    Args:
-    ----
-        url: URL of the target web page
-        **kwargs: keyword arguments from `weasyprint.HTML`
-    Returns:
-    -------
-        the page rendered as a PDF, as returned by `weasyprint.HTML.write_pdf`
-    """
-    return HTML(url, **kwargs).write_pdf()

doctr/io/image/__init__.py DELETED Viewed

@@ -1,8 +0,0 @@
-from doctr.file_utils import is_tf_available, is_torch_available
-from .base import *
-if is_tf_available():
-    from .tensorflow import *
-elif is_torch_available():
-    from .pytorch import *

doctr/io/image/base.py DELETED Viewed

@@ -1,56 +0,0 @@
-# Copyright (C) 2021-2024, Mindee.
-# This program is licensed under the Apache License 2.0.
-# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
-from pathlib import Path
-from typing import Optional, Tuple
-import cv2
-import numpy as np
-from doctr.utils.common_types import AbstractFile
-__all__ = ["read_img_as_numpy"]
-def read_img_as_numpy(
-    file: AbstractFile,
-    output_size: Optional[Tuple[int, int]] = None,
-    rgb_output: bool = True,
-) -> np.ndarray:
-    """Decode an image, given as a path or as raw bytes, into a numpy array
-    >>> from doctr.io import read_img_as_numpy
-    >>> page = read_img_as_numpy("path/to/your/doc.jpg")
-    Args:
-    ----
-        file: path to the image file, or the encoded image as bytes
-        output_size: the expected output size of each page in format H x W
-        rgb_output: whether the output ndarray channel order should be RGB instead of BGR.
-    Returns:
-    -------
-        the page decoded as numpy ndarray of shape H x W x 3
-    """
-    if isinstance(file, bytes):
-        # In-memory content: decode straight from the byte buffer
-        buffer: np.ndarray = np.frombuffer(file, np.uint8)
-        decoded = cv2.imdecode(buffer, cv2.IMREAD_COLOR)
-    elif isinstance(file, (str, Path)):
-        if not Path(file).is_file():
-            raise FileNotFoundError(f"unable to access {file}")
-        decoded = cv2.imread(str(file), cv2.IMREAD_COLOR)
-    else:
-        raise TypeError("unsupported object type for argument 'file'")
-    # OpenCV reports an undecodable input by returning None rather than raising
-    if decoded is None:
-        raise ValueError("unable to read file.")
-    # `output_size` is (H, W) whereas cv2.resize takes (W, H), hence the reversal; non-tuple values are ignored
-    if isinstance(output_size, tuple):
-        decoded = cv2.resize(decoded, output_size[::-1], interpolation=cv2.INTER_LINEAR)
-    # OpenCV decodes in BGR order: convert only when RGB is requested
-    return cv2.cvtColor(decoded, cv2.COLOR_BGR2RGB) if rgb_output else decoded