Upload 108 files
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- Dockerfile +50 -0
- pyproject.toml +262 -0
- src/pdf2u/__init__.py +1 -0
- src/pdf2u/__pycache__/__init__.cpython-311.pyc +0 -0
- src/pdf2u/__pycache__/__init__.cpython-312.pyc +0 -0
- src/pdf2u/__pycache__/const.cpython-311.pyc +0 -0
- src/pdf2u/__pycache__/const.cpython-312.pyc +0 -0
- src/pdf2u/__pycache__/converter.cpython-311.pyc +0 -0
- src/pdf2u/__pycache__/converter.cpython-312.pyc +0 -0
- src/pdf2u/__pycache__/high_level.cpython-311.pyc +0 -0
- src/pdf2u/__pycache__/high_level.cpython-312.pyc +0 -0
- src/pdf2u/__pycache__/io.cpython-312.pyc +0 -0
- src/pdf2u/__pycache__/main.cpython-311.pyc +0 -0
- src/pdf2u/__pycache__/main.cpython-312.pyc +0 -0
- src/pdf2u/__pycache__/pdfinterp.cpython-311.pyc +0 -0
- src/pdf2u/__pycache__/pdfinterp.cpython-312.pyc +0 -0
- src/pdf2u/__pycache__/progress_monitor.cpython-311.pyc +0 -0
- src/pdf2u/__pycache__/progress_monitor.cpython-312.pyc +0 -0
- src/pdf2u/__pycache__/translation_config.cpython-311.pyc +0 -0
- src/pdf2u/__pycache__/translation_config.cpython-312.pyc +0 -0
- src/pdf2u/asynchronize/__init__.py +51 -0
- src/pdf2u/asynchronize/__pycache__/__init__.cpython-311.pyc +0 -0
- src/pdf2u/asynchronize/__pycache__/__init__.cpython-312.pyc +0 -0
- src/pdf2u/const.py +14 -0
- src/pdf2u/converter.py +493 -0
- src/pdf2u/document_il/__init__.py +45 -0
- src/pdf2u/document_il/__pycache__/__init__.cpython-311.pyc +0 -0
- src/pdf2u/document_il/__pycache__/__init__.cpython-312.pyc +0 -0
- src/pdf2u/document_il/__pycache__/il_version_1.cpython-311.pyc +0 -0
- src/pdf2u/document_il/__pycache__/il_version_1.cpython-312.pyc +0 -0
- src/pdf2u/document_il/__pycache__/xml_converter.cpython-311.pyc +0 -0
- src/pdf2u/document_il/__pycache__/xml_converter.cpython-312.pyc +0 -0
- src/pdf2u/document_il/backend/__init__.py +0 -0
- src/pdf2u/document_il/backend/__pycache__/__init__.cpython-311.pyc +0 -0
- src/pdf2u/document_il/backend/__pycache__/__init__.cpython-312.pyc +0 -0
- src/pdf2u/document_il/backend/__pycache__/pdf_creater.cpython-311.pyc +0 -0
- src/pdf2u/document_il/backend/__pycache__/pdf_creater.cpython-312.pyc +0 -0
- src/pdf2u/document_il/backend/pdf_creater.py +405 -0
- src/pdf2u/document_il/frontend/__init__.py +0 -0
- src/pdf2u/document_il/frontend/__pycache__/__init__.cpython-311.pyc +0 -0
- src/pdf2u/document_il/frontend/__pycache__/__init__.cpython-312.pyc +0 -0
- src/pdf2u/document_il/frontend/__pycache__/il_creater.cpython-311.pyc +0 -0
- src/pdf2u/document_il/frontend/__pycache__/il_creater.cpython-312.pyc +0 -0
- src/pdf2u/document_il/frontend/il_creater.py +328 -0
- src/pdf2u/document_il/il_version_1.py +396 -0
- src/pdf2u/document_il/il_version_1.rnc +141 -0
- src/pdf2u/document_il/il_version_1.rng +390 -0
- src/pdf2u/document_il/il_version_1.xsd +235 -0
- src/pdf2u/document_il/midend/__init__.py +0 -0
- src/pdf2u/document_il/midend/__pycache__/__init__.cpython-311.pyc +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Ref: https://github.com/fastapi/full-stack-fastapi-template/blob/master/backend/Dockerfile
|
| 2 |
+
FROM python:3.12-slim-bookworm
|
| 3 |
+
|
| 4 |
+
# Print logs immediately
|
| 5 |
+
# Ref: https://docs.python.org/3/using/cmdline.html#envvar-PYTHONUNBUFFERED
|
| 6 |
+
ENV PYTHONUNBUFFERED=1
|
| 7 |
+
|
| 8 |
+
# Install system dependencies including OpenGL libraries
|
| 9 |
+
RUN apt-get update && apt-get install -y \
|
| 10 |
+
libgl1-mesa-glx \
|
| 11 |
+
libglib2.0-0 \
|
| 12 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 13 |
+
|
| 14 |
+
# Change the working directory to the `app` directory
|
| 15 |
+
WORKDIR /app
|
| 16 |
+
|
| 17 |
+
# Install uv
|
| 18 |
+
# Ref: https://docs.astral.sh/uv/guides/integration/docker/#installing-uv
|
| 19 |
+
COPY --from=ghcr.io/astral-sh/uv:0.5.18 /uv /uvx /bin/
|
| 20 |
+
|
| 21 |
+
# Place executables in the environment at the front of the path
|
| 22 |
+
# Ref: https://docs.astral.sh/uv/guides/integration/docker/#using-the-environment
|
| 23 |
+
ENV PATH="/app/.venv/bin:$PATH"
|
| 24 |
+
|
| 25 |
+
# Compile bytecode to speed up the startup time
|
| 26 |
+
# Ref: https://docs.astral.sh/uv/guides/integration/docker/#compiling-bytecode
|
| 27 |
+
ENV UV_COMPILE_BYTECODE=1
|
| 28 |
+
|
| 29 |
+
# uv Cache
|
| 30 |
+
# Ref: https://docs.astral.sh/uv/guides/integration/docker/#caching
|
| 31 |
+
ENV UV_LINK_MODE=copy
|
| 32 |
+
|
| 33 |
+
# Install dependencies
|
| 34 |
+
# Ref: https://docs.astral.sh/uv/guides/integration/docker/#intermediate-layers
|
| 35 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 36 |
+
--mount=type=bind,source=uv.lock,target=uv.lock \
|
| 37 |
+
--mount=type=bind,source=pyproject.toml,target=pyproject.toml \
|
| 38 |
+
uv sync --frozen --no-install-project
|
| 39 |
+
|
| 40 |
+
# Copy the project into the image
|
| 41 |
+
COPY . .
|
| 42 |
+
|
| 43 |
+
# Sync the project
|
| 44 |
+
# Ref: https://docs.astral.sh/uv/guides/integration/docker/#intermediate-layers
|
| 45 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 46 |
+
uv sync --all-extras
|
| 47 |
+
|
| 48 |
+
EXPOSE 8501
|
| 49 |
+
# Set the default command
|
| 50 |
+
CMD ["streamlit", "run", "src/pdf2u/gui.py"]
|
pyproject.toml
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["hatchling"]
|
| 3 |
+
build-backend = "hatchling.build"
|
| 4 |
+
|
| 5 |
+
[tool.hatch.version]
|
| 6 |
+
path = "src/pdf2u/__init__.py"
|
| 7 |
+
# FROM: https://hatch.pypa.io/latest/version/
|
| 8 |
+
|
| 9 |
+
[tool.hatch.build.targets.wheel]
|
| 10 |
+
packages = ["src/pdf2u"]
|
| 11 |
+
# FROM: https://hatch.pypa.io/latest/build/
|
| 12 |
+
|
| 13 |
+
[project]
|
| 14 |
+
name = "pdf2u"
|
| 15 |
+
version = "0.0.4"
|
| 16 |
+
description = "Yet Another Document Translator"
|
| 17 |
+
classifiers = [
|
| 18 |
+
"License :: OSI Approved :: MIT License",
|
| 19 |
+
"Programming Language :: Python",
|
| 20 |
+
"Programming Language :: Python :: 3 :: Only",
|
| 21 |
+
"Programming Language :: Python :: 3.10",
|
| 22 |
+
"Programming Language :: Python :: 3.11",
|
| 23 |
+
"Programming Language :: Python :: 3.11",
|
| 24 |
+
"Programming Language :: Python :: 3.12",
|
| 25 |
+
"Programming Language :: Python :: 3.13",
|
| 26 |
+
] # FROM: https://pypi.org/classifiers/
|
| 27 |
+
readme = "README.md"
|
| 28 |
+
requires-python = ">=3.10,<3.13"
|
| 29 |
+
license = { file = "LICENSE" }
|
| 30 |
+
authors = [{ name = "A.J.Zeller", email = "hello@atticux.me" }]
|
| 31 |
+
maintainers = [{ name = "A.J.Zeller", email = "hello@atticux.me" }]
|
| 32 |
+
# dynamic = ["version"] # https://hatch.pypa.io/latest/config/metadata/#version
|
| 33 |
+
dependencies = [
|
| 34 |
+
"bitstring>=4.3.0",
|
| 35 |
+
"configargparse>=1.7",
|
| 36 |
+
"httpx[socks]>=0.27.0",
|
| 37 |
+
"huggingface-hub>=0.27.0",
|
| 38 |
+
"numpy>=2.0.2",
|
| 39 |
+
"onnx>=1.17.0",
|
| 40 |
+
"onnxruntime>=1.16.1",
|
| 41 |
+
"openai>=1.59.3",
|
| 42 |
+
"opencv-python>=4.10.0.84",
|
| 43 |
+
"orjson>=3.10.14",
|
| 44 |
+
"pdfminer-six>=20240706",
|
| 45 |
+
"peewee>=3.17.8",
|
| 46 |
+
"rich>=13.9.4",
|
| 47 |
+
"toml>=0.10.2",
|
| 48 |
+
"tqdm>=4.67.1",
|
| 49 |
+
"xsdata[cli,lxml,soap]>=24.12",
|
| 50 |
+
"msgpack>=1.1.0",
|
| 51 |
+
"typer>=0.15.1",
|
| 52 |
+
"pymupdf==1.24.5",
|
| 53 |
+
]
|
| 54 |
+
|
| 55 |
+
[project.urls]
|
| 56 |
+
Homepage = "https://github.com/atticuszeller/pdf2u"
|
| 57 |
+
Issues = "https://github.com/atticuszeller/pdf2u/issues"
|
| 58 |
+
|
| 59 |
+
[project.scripts] # build-backend config needed
|
| 60 |
+
pdf2u = "pdf2u.main:app"
|
| 61 |
+
# FROM: https://packaging.python.org/en/latest/guides/writing-pyproject-toml/
|
| 62 |
+
|
| 63 |
+
[project.optional-dependencies]
|
| 64 |
+
gui = ["pypdf2>=3.0.1", "streamlit>=1.42.2", "streamlit-pdf-viewer>=0.0.21"]
|
| 65 |
+
# optional deps for package installation
|
| 66 |
+
|
| 67 |
+
[dependency-groups]
|
| 68 |
+
dev = [
|
| 69 |
+
"ruff>=0.6.3",
|
| 70 |
+
"mypy>=1.11.2",
|
| 71 |
+
"pre-commit>=3.8.0",
|
| 72 |
+
"pytest>=8.3.2",
|
| 73 |
+
"pytest-sugar>=1.0.0",
|
| 74 |
+
"coverage>=7.6.1",
|
| 75 |
+
"git-cliff>=2.6.1",
|
| 76 |
+
"bump-my-version>=0.28.0",
|
| 77 |
+
"typos>=1.26.8",
|
| 78 |
+
"fonttools>=4.56.0",
|
| 79 |
+
]
|
| 80 |
+
|
| 81 |
+
## Test
|
| 82 |
+
[tool.mypy]
|
| 83 |
+
strict = true
|
| 84 |
+
exclude = ["venv", ".venv"]
|
| 85 |
+
|
| 86 |
+
[tool.pytest.ini_options]
|
| 87 |
+
# Set additional command line options for pytest
|
| 88 |
+
# Ref: https://docs.pytest.org/en/stable/reference/reference.html#command-line-flags
|
| 89 |
+
addopts = "-rXs --strict-config --strict-markers --tb=long"
|
| 90 |
+
xfail_strict = true # Treat tests that are marked as xfail but pass as test failures
|
| 91 |
+
filterwarnings = ["error"] # Treat all warnings as errors
|
| 92 |
+
pythonpath = "src/pdf2u/"
|
| 93 |
+
|
| 94 |
+
[tool.coverage.run]
|
| 95 |
+
branch = true
|
| 96 |
+
|
| 97 |
+
[tool.coverage.report]
|
| 98 |
+
skip_covered = true
|
| 99 |
+
show_missing = true
|
| 100 |
+
precision = 2
|
| 101 |
+
exclude_lines = [
|
| 102 |
+
'def __repr__',
|
| 103 |
+
'pragma: no cover',
|
| 104 |
+
'raise NotImplementedError',
|
| 105 |
+
'if TYPE_CHECKING:',
|
| 106 |
+
'if typing.TYPE_CHECKING:',
|
| 107 |
+
'@overload',
|
| 108 |
+
'@typing.overload',
|
| 109 |
+
'\(Protocol\):$',
|
| 110 |
+
'typing.assert_never',
|
| 111 |
+
'assert_never',
|
| 112 |
+
'if __name__ == .__main__.:',
|
| 113 |
+
]
|
| 114 |
+
|
| 115 |
+
## Linter and formatter
|
| 116 |
+
[tool.ruff]
|
| 117 |
+
# cover and extend the default config in https://docs.astral.sh/ruff/configuration/
|
| 118 |
+
extend-exclude = [""]
|
| 119 |
+
target-version = "py310"
|
| 120 |
+
|
| 121 |
+
[tool.ruff.lint]
|
| 122 |
+
select = [
|
| 123 |
+
"E", # pycodestyle errors
|
| 124 |
+
"W", # pycodestyle warnings
|
| 125 |
+
"F", # pyflakes
|
| 126 |
+
"I", # isort
|
| 127 |
+
"B", # flake8-bugbear
|
| 128 |
+
"C4", # flake8-comprehensions
|
| 129 |
+
"UP", # pyupgrade
|
| 130 |
+
"ARG001", # unused arguments in functions
|
| 131 |
+
]
|
| 132 |
+
|
| 133 |
+
isort = { combine-as-imports = true, split-on-trailing-comma = false }
|
| 134 |
+
|
| 135 |
+
# Avoid trying to fix flake8-bugbear (`B`) violations.
|
| 136 |
+
unfixable = ["B"]
|
| 137 |
+
|
| 138 |
+
[tool.ruff.format]
|
| 139 |
+
docstring-code-format = true
|
| 140 |
+
skip-magic-trailing-comma = true
|
| 141 |
+
|
| 142 |
+
# Reference
|
| 143 |
+
# 1. https://github.com/Kludex/python-template/blob/main/template/%7B%7B%20project_slug%20%7D%7D/pyproject.toml.jinja
|
| 144 |
+
# 2. https://github.com/fastapi/full-stack-fastapi-template/blob/master/backend/pyproject.toml
|
| 145 |
+
# 3. https://github.com/pydantic/logfire
|
| 146 |
+
# 4. https://coverage.readthedocs.io/en/latest/index.html
|
| 147 |
+
|
| 148 |
+
## VCS
|
| 149 |
+
[tool.git-cliff.remote.github]
|
| 150 |
+
owner = "atticuszeller"
|
| 151 |
+
repo = "python-uv-package"
|
| 152 |
+
|
| 153 |
+
[tool.git-cliff.changelog]
|
| 154 |
+
# template for the changelog header
|
| 155 |
+
header = """
|
| 156 |
+
# Changelog\n
|
| 157 |
+
All notable changes to this project will be documented in this file.\n
|
| 158 |
+
"""
|
| 159 |
+
# template for the changelog body
|
| 160 |
+
# https://keats.github.io/tera/docs/#introduction
|
| 161 |
+
body = """
|
| 162 |
+
{% if version %}\
|
| 163 |
+
## {{ version | trim_start_matches(pat="v") }} - {{ timestamp | date(format="%Y-%m-%d") }}
|
| 164 |
+
{% else %}\
|
| 165 |
+
## unreleased
|
| 166 |
+
{% endif %}\
|
| 167 |
+
{% for group, commits in commits | group_by(attribute="group") %}
|
| 168 |
+
### {{ group | striptags | trim | upper_first }}
|
| 169 |
+
{% for commit in commits| unique(attribute="message") %}
|
| 170 |
+
- {% if commit.scope %}*({{ commit.scope }})* {% endif %}\
|
| 171 |
+
{% if commit.breaking %}[**breaking**] {% endif %}\
|
| 172 |
+
{{ commit.message | upper_first }}\
|
| 173 |
+
{% if commit.remote.pr_number %} in #{{ commit.remote.pr_number }}{%- endif %}\
|
| 174 |
+
{% endfor %}
|
| 175 |
+
{% endfor %}\n
|
| 176 |
+
"""
|
| 177 |
+
# template for the changelog footer
|
| 178 |
+
footer = """
|
| 179 |
+
<!-- generated by git-cliff -->
|
| 180 |
+
"""
|
| 181 |
+
# remove the leading and trailing whitespace
|
| 182 |
+
trim = true
|
| 183 |
+
# postprocessors
|
| 184 |
+
# postprocessors = [
|
| 185 |
+
# { pattern = '<REPO>', replace = "https://github.com/atticuszeller/python-uv" }, # replace repository URL
|
| 186 |
+
# ]
|
| 187 |
+
# render body even when there are no releases to process
|
| 188 |
+
render_always = true
|
| 189 |
+
# output file path
|
| 190 |
+
output = "CHANGELOG.md"
|
| 191 |
+
|
| 192 |
+
[tool.git-cliff.git]
|
| 193 |
+
# parse the commits based on https://www.conventionalcommits.org
|
| 194 |
+
conventional_commits = true
|
| 195 |
+
# filter out the commits that are not conventional
|
| 196 |
+
filter_unconventional = true
|
| 197 |
+
# process each line of a commit as an individual commit
|
| 198 |
+
split_commits = false
|
| 199 |
+
# regex for preprocessing the commit messages
|
| 200 |
+
commit_preprocessors = [
|
| 201 |
+
# If the spelling is incorrect, it will be automatically fixed.
|
| 202 |
+
{ pattern = '.*', replace_command = 'typos --write-changes -' },
|
| 203 |
+
]
|
| 204 |
+
# regex for parsing and grouping commits
|
| 205 |
+
commit_parsers = [
|
| 206 |
+
{ message = "^feat", group = "<!-- 0 -->🚀 Features" },
|
| 207 |
+
{ message = "^fix", group = "<!-- 1 -->🐛 Bug Fixes" },
|
| 208 |
+
{ message = "^doc", group = "<!-- 3 -->📚 Documentation" },
|
| 209 |
+
{ message = "^perf", group = "<!-- 4 -->⚡ Performance" },
|
| 210 |
+
{ message = "^refactor", group = "<!-- 2 -->🚜 Refactor" },
|
| 211 |
+
{ message = "^style", group = "<!-- 5 -->🎨 Styling" },
|
| 212 |
+
{ message = "^test", group = "<!-- 6 -->🧪 Testing" },
|
| 213 |
+
{ message = "^chore\\(release\\)", skip = true },
|
| 214 |
+
{ message = "^chore\\(deps.*\\)", skip = true },
|
| 215 |
+
{ message = "^chore\\(pr\\)", skip = true },
|
| 216 |
+
{ message = "^chore\\(pull\\)", skip = true },
|
| 217 |
+
{ message = "^chore|^ci", group = "<!-- 7 -->⚙️ Miscellaneous Tasks" },
|
| 218 |
+
{ body = ".*security", group = "<!-- 8 -->🛡️ Security" },
|
| 219 |
+
{ message = "^revert", group = "<!-- 9 -->◀️ Revert" },
|
| 220 |
+
]
|
| 221 |
+
# filter out the commits that are not matched by commit parsers
|
| 222 |
+
filter_commits = false
|
| 223 |
+
# sort the tags topologically
|
| 224 |
+
topo_order = false
|
| 225 |
+
# sort the commits inside sections by oldest/newest order
|
| 226 |
+
sort_commits = "oldest"
|
| 227 |
+
|
| 228 |
+
[tool.bumpversion]
|
| 229 |
+
current_version = "0.0.4"
|
| 230 |
+
parse = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)"
|
| 231 |
+
serialize = ["{major}.{minor}.{patch}"]
|
| 232 |
+
search = "{current_version}"
|
| 233 |
+
replace = "{new_version}"
|
| 234 |
+
regex = false
|
| 235 |
+
ignore_missing_version = false
|
| 236 |
+
ignore_missing_files = false
|
| 237 |
+
tag = true
|
| 238 |
+
sign_tags = false
|
| 239 |
+
tag_name = "v{new_version}"
|
| 240 |
+
tag_message = "chore(release): {current_version} → {new_version}"
|
| 241 |
+
allow_dirty = true # git-cliff first then bump patch
|
| 242 |
+
commit = true
|
| 243 |
+
message = "chore(release): {current_version} → {new_version}"
|
| 244 |
+
commit_args = ""
|
| 245 |
+
setup_hooks = []
|
| 246 |
+
pre_commit_hooks = []
|
| 247 |
+
post_commit_hooks = []
|
| 248 |
+
|
| 249 |
+
[[tool.bumpversion.files]]
|
| 250 |
+
filename = "src/pdf2u/__init__.py"
|
| 251 |
+
|
| 252 |
+
[[tool.bumpversion.files]]
|
| 253 |
+
filename = "pyproject.toml"
|
| 254 |
+
search = "version = \"{current_version}\""
|
| 255 |
+
replace = "version = \"{new_version}\""
|
| 256 |
+
|
| 257 |
+
[[tool.bumpversion.files]]
|
| 258 |
+
filename = "CHANGELOG.md"
|
| 259 |
+
search = "unreleased"
|
| 260 |
+
replace = "{new_version} - {now:%Y-%m-%d}"
|
| 261 |
+
|
| 262 |
+
# https://callowayproject.github.io/bump-my-version/reference/search-and-replace-config/
|
src/pdf2u/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
__version__ = "0.0.4"
|
src/pdf2u/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (181 Bytes). View file
|
|
|
src/pdf2u/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (175 Bytes). View file
|
|
|
src/pdf2u/__pycache__/const.cpython-311.pyc
ADDED
|
Binary file (519 Bytes). View file
|
|
|
src/pdf2u/__pycache__/const.cpython-312.pyc
ADDED
|
Binary file (855 Bytes). View file
|
|
|
src/pdf2u/__pycache__/converter.cpython-311.pyc
ADDED
|
Binary file (13.8 kB). View file
|
|
|
src/pdf2u/__pycache__/converter.cpython-312.pyc
ADDED
|
Binary file (12.7 kB). View file
|
|
|
src/pdf2u/__pycache__/high_level.cpython-311.pyc
ADDED
|
Binary file (21.2 kB). View file
|
|
|
src/pdf2u/__pycache__/high_level.cpython-312.pyc
ADDED
|
Binary file (18.6 kB). View file
|
|
|
src/pdf2u/__pycache__/io.cpython-312.pyc
ADDED
|
Binary file (583 Bytes). View file
|
|
|
src/pdf2u/__pycache__/main.cpython-311.pyc
ADDED
|
Binary file (13.3 kB). View file
|
|
|
src/pdf2u/__pycache__/main.cpython-312.pyc
ADDED
|
Binary file (13.4 kB). View file
|
|
|
src/pdf2u/__pycache__/pdfinterp.cpython-311.pyc
ADDED
|
Binary file (23.7 kB). View file
|
|
|
src/pdf2u/__pycache__/pdfinterp.cpython-312.pyc
ADDED
|
Binary file (21.5 kB). View file
|
|
|
src/pdf2u/__pycache__/progress_monitor.cpython-311.pyc
ADDED
|
Binary file (9.5 kB). View file
|
|
|
src/pdf2u/__pycache__/progress_monitor.cpython-312.pyc
ADDED
|
Binary file (8.69 kB). View file
|
|
|
src/pdf2u/__pycache__/translation_config.cpython-311.pyc
ADDED
|
Binary file (8.22 kB). View file
|
|
|
src/pdf2u/__pycache__/translation_config.cpython-312.pyc
ADDED
|
Binary file (7.45 kB). View file
|
|
|
src/pdf2u/asynchronize/__init__.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import time
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class Args:
    """Positional and keyword arguments captured from one callback call."""

    def __init__(self, args, kwargs):
        self.args = args
        self.kwargs = kwargs


class AsyncCallback:
    """Turn thread-side callbacks into items of an async iterator.

    ``step_callback`` / ``finished_callback`` are meant to be called from a
    worker thread; every call is wrapped in :class:`Args` and handed to the
    event loop, where it can be consumed with ``async for item in callback``.

    End of stream is signalled by a private sentinel that is enqueued *after*
    the final item. Relying on the ``finished`` flag alone is racy: the flag is
    set only after the final item has been handed over, so a fast consumer
    could re-enter ``__anext__`` while the flag was still ``False`` and then
    wait on the empty queue forever.
    """

    # Unique end-of-stream marker; never yielded by ``__anext__``.
    _END = object()

    def __init__(self):
        self.queue = asyncio.Queue()
        self.finished = False
        # Set once the iterator has seen the end-of-stream marker.
        self._exhausted = False
        try:
            # Preferred: we are being constructed inside a running loop.
            self.loop = asyncio.get_running_loop()
        except RuntimeError:
            # Legacy path kept for callers that construct the object before
            # the loop is started.
            self.loop = asyncio.get_event_loop()

    def step_callback(self, *args, **kwargs):
        """Queue one progress item without ending the stream."""
        item = Args(args, kwargs)

        # We have to use the threadsafe call so that it wakes up the event
        # loop, in case it's sleeping:
        # https://stackoverflow.com/a/49912853/2148718
        self.loop.call_soon_threadsafe(self.queue.put_nowait, item)

        # Add a small delay to release the GIL, ensuring the event loop has
        # time to process messages
        time.sleep(0.01)

    def finished_callback(self, *args, **kwargs):
        """Queue the final item and mark the stream as finished.

        Calls after the first one are ignored.
        """
        if self.finished:
            return
        self.step_callback(*args, **kwargs)
        self.finished = True
        # Loop callbacks run in FIFO order, so the marker is always delivered
        # after the final item queued just above.
        self.loop.call_soon_threadsafe(self.queue.put_nowait, self._END)

    def __await__(self):
        # ``await callback`` waits for the next raw queue entry. Note that,
        # after ``finished_callback``, that entry may be the end-of-stream
        # marker rather than an ``Args`` instance.
        return self.queue.get().__await__()

    def __aiter__(self):
        # Since this implements __anext__, this can return itself
        return self

    async def __anext__(self):
        """Return the next queued item; stop once the end marker is reached."""
        if self._exhausted:
            raise StopAsyncIteration

        item = await self.queue.get()
        if item is self._END:
            self._exhausted = True
            raise StopAsyncIteration
        return item
|
src/pdf2u/asynchronize/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (2.69 kB). View file
|
|
|
src/pdf2u/asynchronize/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (2.5 kB). View file
|
|
|
src/pdf2u/const.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from enum import StrEnum
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
# Per-user cache directory: <home>/.cache/pdf2u
CACHE_FOLDER = Path.home().joinpath(".cache", "pdf2u")


def get_cache_file_path(filename: str) -> Path:
    """Return the path of ``filename`` inside ``CACHE_FOLDER``.

    Only the path is computed; nothing is created on disk.
    """
    cache_path = CACHE_FOLDER.joinpath(filename)
    return cache_path
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class TranslationService(StrEnum):
|
| 12 |
+
OPENAI: str = "openai"
|
| 13 |
+
GOOGLE: str = "google"
|
| 14 |
+
BING: str = "bing"
|
src/pdf2u/converter.py
ADDED
|
@@ -0,0 +1,493 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import logging
|
| 3 |
+
import re
|
| 4 |
+
import unicodedata
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
from pdfminer.converter import PDFConverter
|
| 8 |
+
from pdfminer.layout import LTChar, LTComponent, LTFigure, LTLine, LTPage, LTText
|
| 9 |
+
from pdfminer.pdfcolor import PDFColorSpace
|
| 10 |
+
from pdfminer.pdffont import PDFCIDFont, PDFFont, PDFUnicodeNotDefined
|
| 11 |
+
from pdfminer.pdfinterp import PDFGraphicState, PDFResourceManager
|
| 12 |
+
from pdfminer.utils import Matrix, apply_matrix_pt, bbox2str, matrix2str, mult_matrix
|
| 13 |
+
from pymupdf import Font
|
| 14 |
+
|
| 15 |
+
from pdf2u.document_il.frontend.il_creater import ILCreater
|
| 16 |
+
|
| 17 |
+
log = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class PDFConverterEx(PDFConverter):
    """pdfminer ``PDFConverter`` that reports page and character data to an ``ILCreater``.

    NOTE(review): ``il_creater`` defaults to ``None`` but ``begin_page`` and
    ``render_char`` dereference it unconditionally, so a creater is effectively
    required — confirm with callers.
    """

    def __init__(
        self, rsrcmgr: PDFResourceManager, il_creater: ILCreater | None = None
    ) -> None:
        PDFConverter.__init__(self, rsrcmgr, None, "utf-8", 1, None)
        self.il_creater = il_creater

    def begin_page(self, page, ctm) -> None:
        """Start a page whose box is derived from the page's crop box."""
        # Override: use the crop box
        (x0, y0, x1, y1) = page.cropbox
        (x0, y0) = apply_matrix_pt(ctm, (x0, y0))
        (x1, y1) = apply_matrix_pt(ctm, (x1, y1))
        # Origin-anchored box with the transformed crop box's width and height.
        mediabox = (0, 0, abs(x0 - x1), abs(y0 - y1))
        self.il_creater.on_page_media_box(
            mediabox[0], mediabox[1], mediabox[2], mediabox[3]
        )
        self.il_creater.on_page_number(page.pageno)
        self.cur_item = LTPage(page.pageno, mediabox)

    def end_page(self, _page) -> None:
        """Finish the page by passing the current item to ``receive_layout``."""
        # Override: return the instruction stream
        # NOTE(review): annotated ``-> None`` but forwards whatever
        # ``receive_layout`` returns — confirm the intended return type.
        return self.receive_layout(self.cur_item)

    def begin_figure(self, name, bbox, matrix) -> None:
        """Open a nested ``LTFigure``, pushing the current item on the stack."""
        # Override: set pageid
        self._stack.append(self.cur_item)
        self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
        # The figure takes the page id of the item that encloses it.
        self.cur_item.pageid = self._stack[-1].pageid

    def end_figure(self, _: str) -> None:
        """Close the current figure, attach it to its parent and process it.

        Raises:
            ValueError: if the current item is not an ``LTFigure``.
        """
        # Override: return the instruction stream
        fig = self.cur_item
        if not isinstance(self.cur_item, LTFigure):
            raise ValueError(f"Unexpected item type: {type(self.cur_item)}")
        self.cur_item = self._stack.pop()
        self.cur_item.add(fig)
        return self.receive_layout(fig)

    def render_char(
        self,
        matrix,
        font,
        fontsize: float,
        scaling: float,
        rise: float,
        cid: int,
        ncs,
        graphicstate: PDFGraphicState,
    ) -> float:
        """Create an ``AWLTChar`` for ``cid``, add it to the current item and return its advance."""
        # Override: set cid and font
        try:
            text = font.to_unichr(cid)
            if not isinstance(text, str):
                raise TypeError(f"Expected string, got {type(text)}")
        except PDFUnicodeNotDefined:
            text = self.handle_undefined_char(font, cid)
        textwidth = font.char_width(cid)
        textdisp = font.char_disp(cid)

        font_name = font.fontname
        if isinstance(font_name, bytes):
            # Byte font names are decoded as UTF-8; undecodable names are
            # represented as "BASE64:" + base64 text instead.
            try:
                font_name = font_name.decode("utf-8")
            except UnicodeDecodeError:
                font_name = "BASE64:" + base64.b64encode(font_name).decode("utf-8")
        # Raises KeyError if the name is missing from the creater's map.
        font_id = self.il_creater.current_page_font_name_id_map[font_name]

        item = AWLTChar(
            matrix,
            font,
            fontsize,
            scaling,
            rise,
            text,
            textwidth,
            textdisp,
            ncs,
            graphicstate,
            self.il_creater.xobj_id,
            font_id,
        )
        self.cur_item.add(item)
        item.cid = cid  # hack: attach the original character code
        item.font = font  # hack: attach the original font
        return item.adv
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
class AWLTChar(LTChar):
|
| 108 |
+
"""Actual letter in the text as a Unicode string."""
|
| 109 |
+
|
| 110 |
+
def __init__(
|
| 111 |
+
self,
|
| 112 |
+
matrix: Matrix,
|
| 113 |
+
font: PDFFont,
|
| 114 |
+
fontsize: float,
|
| 115 |
+
scaling: float,
|
| 116 |
+
rise: float,
|
| 117 |
+
text: str,
|
| 118 |
+
textwidth: float,
|
| 119 |
+
textdisp: float | tuple[float | None, float],
|
| 120 |
+
ncs: PDFColorSpace,
|
| 121 |
+
graphicstate: PDFGraphicState,
|
| 122 |
+
xobj_id: int,
|
| 123 |
+
font_id: str,
|
| 124 |
+
) -> None:
|
| 125 |
+
LTText.__init__(self)
|
| 126 |
+
self._text = text
|
| 127 |
+
self.matrix = matrix
|
| 128 |
+
self.fontname = font.fontname
|
| 129 |
+
self.ncs = ncs
|
| 130 |
+
self.graphicstate = graphicstate
|
| 131 |
+
self.xobj_id = xobj_id
|
| 132 |
+
self.adv = textwidth * fontsize * scaling
|
| 133 |
+
self.aw_font_id = font_id
|
| 134 |
+
# compute the boundary rectangle.
|
| 135 |
+
if font.is_vertical():
|
| 136 |
+
# vertical
|
| 137 |
+
assert isinstance(textdisp, tuple)
|
| 138 |
+
(vx, vy) = textdisp
|
| 139 |
+
if vx is None:
|
| 140 |
+
vx = fontsize * 0.5
|
| 141 |
+
else:
|
| 142 |
+
vx = vx * fontsize * 0.001
|
| 143 |
+
vy = (1000 - vy) * fontsize * 0.001
|
| 144 |
+
bbox_lower_left = (-vx, vy + rise + self.adv)
|
| 145 |
+
bbox_upper_right = (-vx + fontsize, vy + rise)
|
| 146 |
+
else:
|
| 147 |
+
# horizontal
|
| 148 |
+
descent = font.get_descent() * fontsize
|
| 149 |
+
bbox_lower_left = (0, descent + rise)
|
| 150 |
+
bbox_upper_right = (self.adv, descent + rise + fontsize)
|
| 151 |
+
(a, b, c, d, e, f) = self.matrix
|
| 152 |
+
self.upright = a * d * scaling > 0 and b * c <= 0
|
| 153 |
+
(x0, y0) = apply_matrix_pt(self.matrix, bbox_lower_left)
|
| 154 |
+
(x1, y1) = apply_matrix_pt(self.matrix, bbox_upper_right)
|
| 155 |
+
if x1 < x0:
|
| 156 |
+
(x0, x1) = (x1, x0)
|
| 157 |
+
if y1 < y0:
|
| 158 |
+
(y0, y1) = (y1, y0)
|
| 159 |
+
LTComponent.__init__(self, (x0, y0, x1, y1))
|
| 160 |
+
if font.is_vertical() or matrix[0] == 0:
|
| 161 |
+
self.size = self.width
|
| 162 |
+
else:
|
| 163 |
+
self.size = self.height
|
| 164 |
+
return
|
| 165 |
+
|
| 166 |
+
def __repr__(self) -> str:
    """Return a debug string with bbox, matrix, font name, advance and text."""
    # Build the space-separated attribute list first, then wrap it in <...>.
    details = " ".join(
        (
            bbox2str(self.bbox),
            f"matrix={matrix2str(self.matrix)}",
            f"font={self.fontname!r}",
            f"adv={self.adv}",
            f"text={self.get_text()!r}",
        )
    )
    return f"<{self.__class__.__name__} {details}>"
|
| 168 |
+
|
| 169 |
+
def get_text(self) -> str:
    """Return the text stored on this object by its constructor (`self._text`)."""
    stored_text = self._text
    return stored_text
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
class Paragraph:
    """Mutable record of the layout attributes tracked for one paragraph.

    Attributes:
        y: initial vertical coordinate.
        x: initial horizontal coordinate.
        x0: left boundary.
        x1: right boundary.
        size: font size.
        brk: line-break flag.
    """

    def __init__(self, y, x, x0, x1, size, brk):
        # Starting position of the paragraph.
        self.y: float = y
        self.x: float = x
        # Horizontal extent (left / right boundary).
        self.x0: float = x0
        self.x1: float = x1
        # Font size and whether the paragraph contains a line break.
        self.size: float = size
        self.brk: bool = brk
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
# fmt: off
|
| 184 |
+
class TranslateConverter(PDFConverterEx):
    """Converter that hands the layout objects of a page to an IL creater.

    NOTE(review): this class was reviewed from a view in which all leading
    indentation had been stripped; the nesting below is reconstructed from the
    syntax. Confirm the placement of every `continue` / `return` against the
    original file before relying on the reachability notes in the comments.
    """

    def __init__(
        self,
        rsrcmgr,
        vfont: str | None = None,
        vchar: str | None = None,
        thread: int = 0,
        layout: dict | None = None,
        lang_in: str = "",  # parameter kept, but marked as unused
        _lang_out: str = "",  # changed to an unused parameter
        _service: str = "",  # changed to an unused parameter
        resfont: str = "",
        noto: Font | None = None,
        envs: dict | None = None,
        _prompt: list | None = None,  # changed to an unused parameter
        il_creater: ILCreater | None = None,
    ):
        """Store the conversion settings on the instance.

        `rsrcmgr` and `il_creater` are forwarded to the base-class constructor.
        `vfont`, `vchar`, `thread`, `layout`, `resfont` and `noto` are stored
        as attributes of the same name. `lang_in`, `_lang_out`, `_service`,
        `envs` and `_prompt` are accepted but not used in this constructor.
        """
        # A missing layout mapping is replaced by an empty dict.
        layout = layout or {}
        super().__init__(rsrcmgr, il_creater)
        self.vfont = vfont
        self.vchar = vchar
        self.thread = thread
        self.layout = layout
        self.resfont = resfont
        self.noto = noto

    def receive_layout(self, ltpage: LTPage):
        """Process the layout objects of one page.

        For each child of `ltpage`, `LTChar` objects are passed to
        `self.il_creater.on_lt_char` and `LTFigure` objects to
        `self.il_creater.on_pdf_figure`.

        NOTE(review): as reconstructed here, the statements that follow each
        `continue` and everything after the bare `return` that follows the
        loop are unreachable (paragraph/formula grouping and text re-layout
        that builds a PDF operator string). The indentation of those
        `continue` / `return` statements could not be verified - confirm.
        """
        # Paragraphs
        sstk: list[str] = []            # paragraph text stack
        pstk: list[Paragraph] = []      # paragraph attribute stack
        vbkt: int = 0                   # bracket counter for formulas inside a paragraph
        # Current formula group
        vstk: list[LTChar] = []         # formula glyph group
        vlstk: list[LTLine] = []        # formula line group
        vfix: float = 0                 # formula vertical offset
        # Stacks of formula groups
        var: list[list[LTChar]] = []    # stack of formula glyph groups
        varl: list[list[LTLine]] = []   # stack of formula line groups
        varf: list[float] = []          # stack of formula vertical offsets
        vlen: list[float] = []          # stack of formula widths
        # Global state
        lstk: list[LTLine] = []         # global line stack
        xt: LTChar = None               # previous character
        xt_cls: int = -1                # class of the previous character; -1 so that the first character always starts a new paragraph whatever its class
        vmax: float = ltpage.width / 4  # maximum width of an inline formula
        ops: str = ""                   # rendering result

        def vflag(font: str, char: str):    # match formula (and sub/superscript) fonts
            if isinstance(font, bytes):     # may not be decodable, so convert with str() directly
                font = str(font)
            font = font.split("+")[-1]      # truncate the font name (keep the part after the last "+")
            if re.match(r"\(cid:", char):
                return True
            # Decision based on font-name rules
            if self.vfont:
                if re.match(self.vfont, font):
                    return True
            else:
                if re.match(                                            # LaTeX fonts
                    r"(CM[^R]|(MS|XY|MT|BL|RM|EU|LA|RS)[A-Z]|LINE|LCIRCLE|TeX-|rsfs|txsy|wasy|stmary|.*Mono|.*Code|.*Ital|.*Sym|.*Math)",
                    font,
                ):
                    return True
            # Decision based on character-set rules
            if self.vchar:
                if re.match(self.vchar, char):
                    return True
            else:
                if (
                    char
                    and char != " "                                     # not a space
                    and (
                        unicodedata.category(char[0])
                        in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"]   # modifier letters/marks, math symbols, separators
                        or ord(char[0]) in range(0x370, 0x400)          # Greek letters
                    )
                ):
                    return True
            return False

        ############################################################
        # A. Parse the original document
        for child in ltpage:
            if isinstance(child, LTChar):
                self.il_creater.on_lt_char(child)
                continue
                # NOTE(review): everything below in this branch follows the `continue`
                # above and therefore appears to be unreachable legacy code.
                cur_v = False
                layout = self.layout[ltpage.pageid]
                # ltpage.height may be the height inside a figure, so layout.shape is used uniformly here
                h, w = layout.shape
                # Read the class of the current character from the layout
                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
                cls = layout[cy, cx]
                # Anchor the position of bullets in the document
                if child.get_text() == "•":
                    cls = 0
                # Decide whether the current character belongs to a formula
                if (                                                                                        # decide whether the current character belongs to a formula
                    cls == 0                                                                                # 1. class is a reserved region
                    or (cls == xt_cls and len(sstk[-1].strip()) > 1 and child.size < pstk[-1].size * 0.79)  # 2. sub/superscript font: scripts are at 0.76 and capitals at 0.799, 0.79 is taken as a middle value; also accounts for enlarged initial letters
                    or vflag(child.fontname, child.get_text())                                              # 3. formula font
                    or (child.matrix[0] == 0 and child.matrix[3] == 0)                                      # 4. vertical font
                ):
                    cur_v = True
                # Decide whether a bracket group belongs to a formula
                if not cur_v:
                    if vstk and child.get_text() == "(":
                        cur_v = True
                        vbkt += 1
                    if vbkt and child.get_text() == ")":
                        cur_v = True
                        vbkt -= 1
                if (                                                        # decide whether the current formula ends
                    not cur_v                                               # 1. current character is not part of a formula
                    or cls != xt_cls                                        # 2. current and previous character are not in the same paragraph
                    # or (abs(child.x0 - xt.x0) > vmax and cls != 0)        # 3. line break inside a paragraph: may be a long italic passage or an inline fraction wrapping; a threshold tells them apart
                    # Forbid line breaks in pure formula (code) paragraphs; a text paragraph is reopened only once text starts, so only two cases exist:
                    # A. pure formula (code) paragraph (anchored at an absolute position) sstk[-1]=="" -> sstk[-1]=="{v*}"
                    # B. paragraph starting with text (laid out at a relative position) sstk[-1]!=""
                    or (sstk[-1] != "" and abs(child.x0 - xt.x0) > vmax)    # cls==xt_cls==0 implies sstk[-1]=="", so cls!=0 need not be checked again here
                ):
                    if vstk:
                        if (                                                # correct the formula's vertical offset from the text on its right
                            not cur_v                                       # 1. current character is not part of a formula
                            and cls == xt_cls                               # 2. current and previous character are in the same paragraph
                            and child.x0 > max([vch.x0 for vch in vstk])    # 3. current character is to the right of the formula
                        ):
                            vfix = vstk[0].y0 - child.y0
                        if sstk[-1] == "":
                            xt_cls = -1 # stop later characters from joining a pure formula paragraph (sstk[-1]=="{v*}"); the new character must still connect to following ones, so the previous character's class is changed instead
                        sstk[-1] += f"{{v{len(var)}}}"
                        var.append(vstk)
                        varl.append(vlstk)
                        varf.append(vfix)
                        vstk = []
                        vlstk = []
                        vfix = 0
                # Current character is not part of a formula, or is the first character of one
                if not vstk:
                    if cls == xt_cls:               # current and previous character are in the same paragraph
                        if child.x0 > xt.x1 + 1:    # add an inline space
                            sstk[-1] += " "
                        elif child.x1 < xt.x0:      # add a line-break space and mark that the source paragraph wraps
                            sstk[-1] += " "
                            pstk[-1].brk = True
                    else:                           # start a new paragraph from the current character
                        sstk.append("")
                        pstk.append(Paragraph(child.y0, child.x0, child.x0, child.x0, child.size, False))
                if not cur_v:                                               # push text
                    if (                                                    # adjust paragraph attributes from the current character
                        child.size > pstk[-1].size / 0.79                   # 1. current character is clearly larger than the paragraph font
                        or len(sstk[-1].strip()) == 1                       # 2. current character is the second character of the paragraph (enlarged initial letter case)
                    ) and child.get_text() != " ":                          # 3. current character is not a space
                        pstk[-1].y -= child.size - pstk[-1].size            # adjust the paragraph's initial y, assuming the tops of two differently sized characters are aligned
                        pstk[-1].size = child.size
                    sstk[-1] += child.get_text()
                else:                                                       # push formula
                    if (                                                    # correct the formula's vertical offset from the text on its left
                        not vstk                                            # 1. current character is the first character of the formula
                        and cls == xt_cls                                   # 2. current and previous character are in the same paragraph
                        and child.x0 > xt.x0                                # 3. previous character is to the left of the formula
                    ):
                        vfix = child.y0 - xt.y0
                    vstk.append(child)
                # Update paragraph bounds; a formula may start right after a line break inside a paragraph, so this is done outside the branches
                pstk[-1].x0 = min(pstk[-1].x0, child.x0)
                pstk[-1].x1 = max(pstk[-1].x1, child.x1)
                # Update the previous character
                xt = child
                xt_cls = cls
            elif isinstance(child, LTFigure):
                # Figure
                self.il_creater.on_pdf_figure(child)
                pass
            elif isinstance(child, LTLine):     # line
                continue
                # NOTE(review): the rest of this branch follows the `continue` above
                # and therefore appears to be unreachable legacy code.
                layout = self.layout[ltpage.pageid]
                # ltpage.height may be the height inside a figure, so layout.shape is used uniformly here
                h, w = layout.shape
                # Read the class of the current line from the layout
                cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(int(child.y0), 0, h - 1)
                cls = layout[cy, cx]
                if vstk and cls == xt_cls:      # formula line
                    vlstk.append(child)
                else:                           # global line
                    lstk.append(child)
            else:
                pass
        # NOTE(review): assuming this `return` sits at function level (indentation
        # was lost), all code below it is unreachable and the method returns None.
        return
        # Handle the tail
        if vstk:    # pop the pending formula
            sstk[-1] += f"{{v{len(var)}}}"
            var.append(vstk)
            varl.append(vlstk)
            varf.append(vfix)
        log.debug("\n==========[VSTACK]==========\n")
        for var_id, v in enumerate(var):    # compute formula widths
            l = max([vch.x1 for vch in v]) - v[0].x0
            log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[var_id])} > v{var_id} = {"".join([ch.get_text() for ch in v])}')
            vlen.append(l)

        ############################################################
        # B. Paragraph translation
        log.debug("\n==========[SSTACK]==========\n")

        # No translation call is made here; the paragraph texts are copied as they are.
        news = sstk.copy()

        ############################################################
        # C. Layout of the new document
        def raw_string(fcur: str, cstk: str):  # encode a string as hex digits
            if fcur == 'noto':
                return "".join([f"{self.noto.has_glyph(ord(c)):04x}" for c in cstk])
            elif isinstance(self.fontmap[fcur], PDFCIDFont):  # determine the encoding length
                return "".join([f"{ord(c):04x}" for c in cstk])
            else:
                return "".join([f"{ord(c):02x}" for c in cstk])

        _x, _y = 0, 0
        for para_id, new in enumerate(news):
            x: float = pstk[para_id].x          # paragraph initial horizontal coordinate
            y: float = pstk[para_id].y          # paragraph initial vertical coordinate
            x0: float = pstk[para_id].x0        # paragraph left boundary
            x1: float = pstk[para_id].x1        # paragraph right boundary
            size: float = pstk[para_id].size    # paragraph font size
            brk: bool = pstk[para_id].brk       # paragraph line-break flag
            cstk: str = ""                      # current text buffer
            fcur: str = None                    # current font ID
            tx = x
            fcur_ = fcur
            ptr = 0
            log.debug(f"< {y} {x} {x0} {x1} {size} {brk} > {sstk[para_id]} | {new}")
            while ptr < len(new):
                vy_regex = re.match(
                    r"\{\s*v([\d\s]+)\}", new[ptr:], re.IGNORECASE,
                )  # match a {vn} formula marker
                mod = 0  # width of a trailing text modifier
                if vy_regex:  # load a formula
                    ptr += len(vy_regex.group(0))
                    try:
                        vid = int(vy_regex.group(1).replace(" ", ""))
                        adv = vlen[vid]
                    except Exception as e:
                        log.debug("Skipping formula placeholder due to: %s", e)
                        continue  # the translator may add a formula marker whose index is out of range
                    if var[vid][-1].get_text() and unicodedata.category(var[vid][-1].get_text()[0]) in ["Lm", "Mn", "Sk"]:  # text modifier
                        mod = var[vid][-1].width
                else:  # load a text character
                    ch = new[ptr]
                    fcur_ = None
                    try:
                        if fcur_ is None and self.fontmap["tiro"].to_unichr(ord(ch)) == ch:
                            fcur_ = "tiro"  # default Latin font
                    except Exception:
                        pass
                    if fcur_ is None:
                        fcur_ = self.resfont  # default non-Latin font
                    if fcur_ == 'noto':
                        adv = self.noto.char_lengths(ch, size)[0]
                    else:
                        adv = self.fontmap[fcur_].char_width(ord(ch)) * size
                    ptr += 1
                if (                                # flush the text buffer
                    fcur_ != fcur                   # 1. font changed
                    or vy_regex                     # 2. a formula is inserted
                    or x + adv > x1 + 0.1 * size    # 3. right boundary reached (a whole line may have been symbolized; allow for floating-point error)
                ):
                    if cstk:
                        ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
                        cstk = ""
                if brk and x + adv > x1 + 0.1 * size:  # right boundary reached and the source paragraph wraps
                    x = x0
                    # NOTE(review): lang_space is assigned but only referenced by the commented-out line below.
                    lang_space = {"zh-cn": 1.4, "zh-tw": 1.4, "zh-hans": 1.4, "zh-hant": 1.4, "zh": 1.4, "ja": 1.1, "ko": 1.2, "en": 1.2, "ar": 1.0, "ru": 0.8, "uk": 0.8, "ta": 0.8}
                    # y -= size * lang_space.get(self.translator.lang_out.lower(), 1.1)  # most less-common languages fit 1.1
                    y -= size * 1.4
                if vy_regex:  # insert the formula
                    fix = 0
                    if fcur is not None:  # formulas inside a paragraph get the vertical offset correction
                        fix = varf[vid]
                    for vch in var[vid]:  # lay out the formula characters
                        vc = chr(vch.cid)
                        ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm <{raw_string(self.fontid[vch.font], vc)}> TJ "
                        if log.isEnabledFor(logging.DEBUG):
                            lstk.append(LTLine(0.1, (_x, _y), (x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0)))
                            _x, _y = x + vch.x0 - var[vid][0].x0, fix + y + vch.y0 - var[vid][0].y0
                    for l in varl[vid]:  # lay out the formula lines
                        if l.linewidth < 5:  # hack: some documents use thick lines as an image background
                            ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
                else:  # append to the text buffer
                    if not cstk:  # start of a line
                        tx = x
                        if x == x0 and ch == " ":  # drop the space produced by a paragraph line break
                            adv = 0
                        else:
                            cstk += ch
                    else:
                        cstk += ch
                adv -= mod # text modifier
                fcur = fcur_
                x += adv
                if log.isEnabledFor(logging.DEBUG):
                    lstk.append(LTLine(0.1, (_x, _y), (x, y)))
                    _x, _y = x, y
            # Handle the tail of the paragraph
            if cstk:
                ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm <{raw_string(fcur, cstk)}> TJ "
        for l in lstk:  # lay out the global lines
            if l.linewidth < 5:  # hack: some documents use thick lines as an image background
                ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT "
        ops = f"BT {ops}ET "
        return ops
|
src/pdf2u/document_il/__init__.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pdf2u.document_il.il_version_1 import (
|
| 2 |
+
BaseOperations,
|
| 3 |
+
Box,
|
| 4 |
+
Cropbox,
|
| 5 |
+
Document,
|
| 6 |
+
GraphicState,
|
| 7 |
+
Mediabox,
|
| 8 |
+
Page,
|
| 9 |
+
PageLayout,
|
| 10 |
+
PdfCharacter,
|
| 11 |
+
PdfFigure,
|
| 12 |
+
PdfFont,
|
| 13 |
+
PdfFormula,
|
| 14 |
+
PdfLine,
|
| 15 |
+
PdfParagraph,
|
| 16 |
+
PdfParagraphComposition,
|
| 17 |
+
PdfRectangle,
|
| 18 |
+
PdfSameStyleCharacters,
|
| 19 |
+
PdfSameStyleUnicodeCharacters,
|
| 20 |
+
PdfStyle,
|
| 21 |
+
PdfXobject,
|
| 22 |
+
)
|
| 23 |
+
|
| 24 |
+
__all__ = [
|
| 25 |
+
"BaseOperations",
|
| 26 |
+
"Box",
|
| 27 |
+
"Cropbox",
|
| 28 |
+
"Document",
|
| 29 |
+
"GraphicState",
|
| 30 |
+
"Mediabox",
|
| 31 |
+
"Page",
|
| 32 |
+
"PageLayout",
|
| 33 |
+
"PdfCharacter",
|
| 34 |
+
"PdfFigure",
|
| 35 |
+
"PdfFont",
|
| 36 |
+
"PdfFormula",
|
| 37 |
+
"PdfLine",
|
| 38 |
+
"PdfParagraph",
|
| 39 |
+
"PdfParagraphComposition",
|
| 40 |
+
"PdfRectangle",
|
| 41 |
+
"PdfSameStyleCharacters",
|
| 42 |
+
"PdfSameStyleUnicodeCharacters",
|
| 43 |
+
"PdfStyle",
|
| 44 |
+
"PdfXobject",
|
| 45 |
+
]
|
src/pdf2u/document_il/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (912 Bytes). View file
|
|
|
src/pdf2u/document_il/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (706 Bytes). View file
|
|
|
src/pdf2u/document_il/__pycache__/il_version_1.cpython-311.pyc
ADDED
|
Binary file (22 kB). View file
|
|
|
src/pdf2u/document_il/__pycache__/il_version_1.cpython-312.pyc
ADDED
|
Binary file (17.1 kB). View file
|
|
|
src/pdf2u/document_il/__pycache__/xml_converter.cpython-311.pyc
ADDED
|
Binary file (4.42 kB). View file
|
|
|
src/pdf2u/document_il/__pycache__/xml_converter.cpython-312.pyc
ADDED
|
Binary file (3.81 kB). View file
|
|
|
src/pdf2u/document_il/backend/__init__.py
ADDED
|
File without changes
|
src/pdf2u/document_il/backend/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (181 Bytes). View file
|
|
|
src/pdf2u/document_il/backend/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (169 Bytes). View file
|
|
|
src/pdf2u/document_il/backend/__pycache__/pdf_creater.cpython-311.pyc
ADDED
|
Binary file (19.8 kB). View file
|
|
|
src/pdf2u/document_il/backend/__pycache__/pdf_creater.cpython-312.pyc
ADDED
|
Binary file (18.5 kB). View file
|
|
|
src/pdf2u/document_il/backend/pdf_creater.py
ADDED
|
@@ -0,0 +1,405 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
import re
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import pymupdf
|
| 6 |
+
from bitstring import BitStream
|
| 7 |
+
|
| 8 |
+
from pdf2u.document_il import il_version_1
|
| 9 |
+
from pdf2u.document_il.utils.fontmap import FontMapper
|
| 10 |
+
from pdf2u.translation_config import TranslateResult, TranslationConfig
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
SUBSET_FONT_STAGE_NAME = "Subset font"
|
| 15 |
+
SAVE_PDF_STAGE_NAME = "Save PDF"
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class PDFCreater:
|
| 19 |
+
stage_name = "Generate drawing instructions"
|
| 20 |
+
|
| 21 |
+
def __init__(
    self,
    original_pdf_path: str,
    document: il_version_1.Document,
    translation_config: TranslationConfig,
):
    """Remember the source PDF path, the IL document and the configuration.

    Args:
        original_pdf_path: stored as `self.original_pdf_path`.
        document: IL document, stored as `self.docs`.
        translation_config: stored as `self.translation_config`; its `font`
            attribute is stored as `self.font_path` and it is also used to
            construct the `FontMapper` kept in `self.font_mapper`.
    """
    config = translation_config
    self.original_pdf_path = original_pdf_path
    self.docs = document
    self.font_path = config.font
    self.font_mapper = FontMapper(config)
    self.translation_config = config
|
| 32 |
+
|
| 33 |
+
def render_graphic_state(
    self, draw_op: BitStream, graphic_state: il_version_1.GraphicState
):
    """Append the graphic state's passthrough instruction to `draw_op`.

    Nothing is appended when `graphic_state` is None or when its
    `passthrough_per_char_instruction` is empty/falsy. Explicit colour-space
    and colour operators are not emitted here; only the passthrough
    instruction, followed by a space and a newline, is written.
    """
    instruction = (
        None
        if graphic_state is None
        else graphic_state.passthrough_per_char_instruction
    )
    if not instruction:
        return
    draw_op.append(f"{instruction} \n".encode())
|
| 65 |
+
)
|
| 66 |
+
|
| 67 |
+
def render_paragraph_to_char(
    self, paragraph: il_version_1.PdfParagraph
) -> list[il_version_1.PdfCharacter]:
    """Collect the characters of a paragraph's compositions.

    Every composition whose `pdf_character` is a `PdfCharacter` contributes
    that character, in order. Other compositions are reported with
    `logger.error` and skipped. If no character was collected but the
    paragraph has `unicode` text, an error is logged as well.

    Args:
        paragraph: paragraph whose `pdf_paragraph_composition` is iterated.

    Returns:
        The collected characters (possibly an empty list).
    """
    chars = []
    for composition in paragraph.pdf_paragraph_composition:
        if isinstance(composition.pdf_character, il_version_1.PdfCharacter):
            chars.append(composition.pdf_character)
            continue
        # Lazy %-style arguments: the message is only formatted when emitted.
        # The sentences are separated by spaces so the log line stays readable.
        logger.error(
            "Unknown composition type. "
            "This type only appears in the IL "
            "after the translation is completed. "
            "During pdf rendering, this type is not supported. "
            "Composition: %s. "
            "Paragraph: %s. ",
            composition,
            paragraph,
        )
    if not chars and paragraph.unicode:
        logger.error(
            "Unable to export paragraphs that have not yet been formatted: %s",
            paragraph,
        )
    return chars
|
| 90 |
+
|
| 91 |
+
def get_available_font_list(self, pdf, page):
    """Return the font names found for `page` in `pdf`.

    Looks up the xref of the PDF page at index `page.page_number` and
    delegates to `get_xobj_available_fonts`.
    """
    pdf_page = pdf[page.page_number]
    return self.get_xobj_available_fonts(pdf_page.xref, pdf)
|
| 94 |
+
|
| 95 |
+
def get_xobj_available_fonts(self, page_xref_id, pdf):
    """Return the set of font resource names of the object `page_xref_id`.

    The object's "Resources" key is read with `pdf.xref_get_key`:

    * "xref": the referenced object is loaded with `pdf.xref_object` and
      then handled like a dictionary.
    * "dict": the "/Font" entry is either an indirect reference (loaded with
      `pdf.xref_object`) or an inline `<< ... >>` dictionary; if neither is
      present an empty set is returned.
    * anything else: the leading integer of the value is used as an xref
      whose "Font" key is read.

    Font names are the `/Name` tokens that are followed by a space.
    """
    kind, payload = pdf.xref_get_key(page_xref_id, "Resources")

    if kind == "xref":
        # Indirect resources: resolve the reference and continue as a dict.
        target = re.search("(\\d+) 0 R", payload).group(1)
        payload = pdf.xref_object(int(target))
        kind = "dict"

    if kind != "dict":
        owner_xref = int(payload.split(" ")[0])
        _, font_dict = pdf.xref_get_key(owner_xref, "Font")
    else:
        indirect = re.search("/Font (\\d+) 0 R", payload)
        if indirect is None:
            inline = re.search("/Font *<<(.+?)>>", payload.replace("\n", " "))
            if inline is None:
                # Have resources but no fonts
                return set()
            font_dict = inline.group(1)
        else:
            font_dict = pdf.xref_object(int(indirect.group(1)))

    return set(re.findall("/([^ ]+?) ", font_dict))
|
| 117 |
+
|
| 118 |
+
def _debug_render_rectangle(
|
| 119 |
+
self, draw_op: BitStream, rectangle: il_version_1.PdfRectangle
|
| 120 |
+
):
|
| 121 |
+
"""Draw a debug rectangle in PDF for visualization purposes.
|
| 122 |
+
|
| 123 |
+
Args:
|
| 124 |
+
draw_op: BitStream to append PDF drawing operations
|
| 125 |
+
rectangle: Rectangle object containing position information
|
| 126 |
+
"""
|
| 127 |
+
x1 = rectangle.box.x
|
| 128 |
+
y1 = rectangle.box.y
|
| 129 |
+
x2 = rectangle.box.x2
|
| 130 |
+
y2 = rectangle.box.y2
|
| 131 |
+
# Save graphics state
|
| 132 |
+
draw_op.append(b"q ")
|
| 133 |
+
|
| 134 |
+
# Set green color for debug visibility
|
| 135 |
+
draw_op.append(
|
| 136 |
+
rectangle.graphic_state.passthrough_per_char_instruction.encode()
|
| 137 |
+
) # Green stroke
|
| 138 |
+
draw_op.append(b" 1 w ") # Line width
|
| 139 |
+
|
| 140 |
+
# Draw four lines manually
|
| 141 |
+
# Bottom line
|
| 142 |
+
draw_op.append(f"{x1} {y1} m {x2} {y1} l S ".encode())
|
| 143 |
+
# Right line
|
| 144 |
+
draw_op.append(f"{x2} {y1} m {x2} {y2} l S ".encode())
|
| 145 |
+
# Top line
|
| 146 |
+
draw_op.append(f"{x2} {y2} m {x1} {y2} l S ".encode())
|
| 147 |
+
# Left line
|
| 148 |
+
draw_op.append(f"{x1} {y2} m {x1} {y1} l S ".encode())
|
| 149 |
+
|
| 150 |
+
# Restore graphics state
|
| 151 |
+
draw_op.append(b"Q\n")
|
| 152 |
+
|
| 153 |
+
def write_debug_info(
    self, pdf: pymupdf.Document, translation_config: TranslationConfig
):
    """Append debug drawing instructions to the content stream of every page.

    For each page of `self.docs`, the stream referenced by the page's
    "Contents" key is read, wrapped in `q ... Q`, followed by a translation
    to the crop-box origin, the characters flagged with `debug_info` and the
    rectangles flagged with `debug_info`; the result is written back with
    `pdf.update_stream`. Finally `pdf.subset_fonts(fallback=False)` is called.

    NOTE(review): indentation was lost in the view this was reviewed from;
    the loop nesting below is reconstructed - confirm against the original.
    NOTE(review): `re.search(...).group(1)` assumes "Contents" is a single
    indirect reference ("N 0 R"); other shapes would raise - confirm.
    """
    self.font_mapper.add_font(pdf, self.docs)

    for page in self.docs.page:
        _, r_id = pdf.xref_get_key(pdf[page.page_number].xref, "Contents")
        resource_xref_id = re.search("(\\d+) 0 R", r_id).group(1)
        base_op = pdf.xref_stream(int(resource_xref_id))
        translation_config.raise_if_cancelled()
        # The three xobj_* dicts are assigned but not read in this method.
        xobj_available_fonts = {}
        xobj_draw_ops = {}
        xobj_encoding_length_map = {}
        available_font_list = self.get_available_font_list(pdf, page)

        page_encoding_length_map = {
            f.font_id: f.encoding_length for f in page.pdf_font
        }
        page_op = BitStream()
        # q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}
        page_op.append(b"q ")
        if base_op is not None:
            page_op.append(base_op)
        page_op.append(b" Q ")
        page_op.append(
            f"q Q 1 0 0 1 {page.cropbox.box.x} {page.cropbox.box.y} cm \n".encode()
        )
        # Collect all characters
        chars = []
        # First add the page-level characters
        if page.pdf_character:
            chars.extend(page.pdf_character)
        # Then add the characters contained in paragraphs
        for paragraph in page.pdf_paragraph:
            chars.extend(self.render_paragraph_to_char(paragraph))

        # Render all characters (only those flagged with debug_info)
        for char in chars:
            if not getattr(char, "debug_info", False):
                continue
            if char.char_unicode == "\n":
                continue
            if char.pdf_character_id is None:
                # dummy char
                continue
            char_size = char.pdf_style.font_size
            font_id = char.pdf_style.font_id

            # Skip characters whose font is not among the page's fonts.
            if font_id not in available_font_list:
                continue
            draw_op = page_op
            encoding_length_map = page_encoding_length_map

            draw_op.append(b"q ")
            self.render_graphic_state(draw_op, char.pdf_style.graphic_state)
            if char.vertical:
                # Text matrix "0 1 -1 0" is used for vertical characters, anchored at (x2, y).
                draw_op.append(
                    f"BT /{font_id} {char_size:f} Tf 0 1 -1 0 {char.box.x2:f} {char.box.y:f} Tm ".encode()
                )
            else:
                draw_op.append(
                    f"BT /{font_id} {char_size:f} Tf 1 0 0 1 {char.box.x:f} {char.box.y:f} Tm ".encode()
                )

            encoding_length = encoding_length_map[font_id]
            # pdf32000-2008 page14:
            # As hexadecimal data enclosed in angle brackets < >
            # see 7.3.4.3, "Hexadecimal Strings."
            # The character id is zero-padded to two hex digits per encoding byte.
            draw_op.append(
                f"<{char.pdf_character_id:0{encoding_length * 2}x}>".upper().encode()
            )

            draw_op.append(b" Tj ET Q \n")
        for rect in page.pdf_rectangle:
            if not rect.debug_info:
                continue
            self._debug_render_rectangle(page_op, rect)
        draw_op = page_op
        # Since this is a draw instruction container,
        # no additional information is needed
        pdf.update_stream(int(resource_xref_id), draw_op.tobytes())
    translation_config.raise_if_cancelled()
    pdf.subset_fonts(fallback=False)
|
| 236 |
+
|
| 237 |
+
def write(self, translation_config: TranslationConfig) -> TranslateResult:
    """Render the translated document and save the mono and/or dual PDFs.

    For every page in ``self.docs`` the page content stream (and the stream
    of every form XObject on the page) is rebuilt from the stored base
    operations followed by one ``BT ... Tj ET`` sequence per character.
    Fonts are then subset (unless ``skip_clean`` is set) and the result is
    saved.  When dual output is enabled the original document is opened a
    second time, the translated pages are appended to it and then
    interleaved with the original pages.

    Args:
        translation_config: Supplies input/output paths, the ``debug``,
            ``no_mono``, ``no_dual``, ``skip_clean`` and
            ``dual_translate_first`` switches, and the cancellation check.

    Returns:
        A ``TranslateResult`` built from the mono output path and the dual
        output path (``None`` when dual output is disabled).
    """
    basename = Path(translation_config.input_file).stem
    debug_suffix = ".debug" if translation_config.debug else ""
    mono_out_path = translation_config.get_output_file_path(
        f"{basename}{debug_suffix}.{translation_config.lang_out}.mono.pdf"
    )
    pdf = pymupdf.open(self.original_pdf_path)
    try:
        self.font_mapper.add_font(pdf, self.docs)
        with self.translation_config.progress_monitor.stage_start(
            self.stage_name, len(self.docs.page)
        ) as pbar:
            for page in self.docs.page:
                translation_config.raise_if_cancelled()
                xobj_available_fonts = {}
                xobj_draw_ops = {}
                xobj_encoding_length_map = {}
                available_font_list = self.get_available_font_list(pdf, page)

                for xobj in page.pdf_xobject:
                    xobj_available_fonts[xobj.xobj_id] = available_font_list.copy()
                    try:
                        xobj_available_fonts[xobj.xobj_id].update(
                            self.get_xobj_available_fonts(xobj.xref_id, pdf)
                        )
                    except Exception:
                        # Best effort: fall back to the page-level fonts only,
                        # but leave a trace instead of hiding the failure.
                        logger.debug(
                            "Failed to collect fonts of XObject xref %s",
                            xobj.xref_id,
                            exc_info=True,
                        )
                    xobj_encoding_length_map[xobj.xobj_id] = {
                        f.font_id: f.encoding_length for f in xobj.pdf_font
                    }
                    xobj_op = BitStream()
                    xobj_op.append(xobj.base_operations.value.encode())
                    xobj_draw_ops[xobj.xobj_id] = xobj_op
                page_encoding_length_map = {
                    f.font_id: f.encoding_length for f in page.pdf_font
                }
                page_op = BitStream()
                # q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}
                page_op.append(b"q ")
                page_op.append(page.base_operations.value.encode())
                page_op.append(b" Q ")
                page_op.append(
                    f"q Q 1 0 0 1 {page.cropbox.box.x} {page.cropbox.box.y} cm \n".encode()
                )
                # Collect all characters: page-level ones first, then the
                # characters produced from each paragraph.
                chars = []
                if page.pdf_character:
                    chars.extend(page.pdf_character)
                for paragraph in page.pdf_paragraph:
                    chars.extend(self.render_paragraph_to_char(paragraph))

                # Render all characters.
                for char in chars:
                    if char.char_unicode == "\n":
                        continue
                    if char.pdf_character_id is None:
                        # dummy char
                        continue
                    char_size = char.pdf_style.font_size
                    font_id = char.pdf_style.font_id
                    if char.xobj_id in xobj_available_fonts:
                        if font_id not in xobj_available_fonts[char.xobj_id]:
                            continue
                        draw_op = xobj_draw_ops[char.xobj_id]
                        encoding_length_map = xobj_encoding_length_map[char.xobj_id]
                    else:
                        if font_id not in available_font_list:
                            continue
                        draw_op = page_op
                        encoding_length_map = page_encoding_length_map

                    draw_op.append(b"q ")
                    self.render_graphic_state(draw_op, char.pdf_style.graphic_state)
                    if char.vertical:
                        draw_op.append(
                            f"BT /{font_id} {char_size:f} Tf 0 1 -1 0 {char.box.x2:f} {char.box.y:f} Tm ".encode()
                        )
                    else:
                        draw_op.append(
                            f"BT /{font_id} {char_size:f} Tf 1 0 0 1 {char.box.x:f} {char.box.y:f} Tm ".encode()
                        )

                    encoding_length = encoding_length_map[font_id]
                    # pdf32000-2008 page14:
                    # As hexadecimal data enclosed in angle brackets < >
                    # see 7.3.4.3, "Hexadecimal Strings."
                    draw_op.append(
                        f"<{char.pdf_character_id:0{encoding_length * 2}x}>".upper().encode()
                    )

                    draw_op.append(b" Tj ET Q \n")
                for xobj in page.pdf_xobject:
                    draw_op = xobj_draw_ops[xobj.xobj_id]
                    pdf.update_stream(xobj.xref_id, draw_op.tobytes())
                for rect in page.pdf_rectangle:
                    self._debug_render_rectangle(page_op, rect)
                draw_op = page_op
                op_container = pdf.get_new_xref()
                # Since this is a draw instruction container,
                # no additional information is needed
                pdf.update_object(op_container, "<<>>")
                pdf.update_stream(op_container, draw_op.tobytes())
                pdf[page.page_number].set_contents(op_container)
                pbar.advance()
        translation_config.raise_if_cancelled()
        with self.translation_config.progress_monitor.stage_start(
            SUBSET_FONT_STAGE_NAME, 1
        ) as pbar:
            if not translation_config.skip_clean:
                pdf.subset_fonts(fallback=False)
            pbar.advance()
        with self.translation_config.progress_monitor.stage_start(
            SAVE_PDF_STAGE_NAME, 2
        ) as pbar:
            if not translation_config.no_mono:
                if translation_config.debug:
                    translation_config.raise_if_cancelled()
                    pdf.save(
                        f"{mono_out_path}.decompressed.pdf", expand=True, pretty=True
                    )
                translation_config.raise_if_cancelled()
                pdf.save(
                    mono_out_path,
                    garbage=3,
                    deflate=True,
                    clean=not translation_config.skip_clean,
                    deflate_fonts=True,
                    linear=True,
                )
            pbar.advance()
            dual_out_path = None
            if not translation_config.no_dual:
                dual_out_path = translation_config.get_output_file_path(
                    f"{basename}{debug_suffix}.{translation_config.lang_out}.dual.pdf"
                )
                translation_config.raise_if_cancelled()
                dual = pymupdf.open(self.original_pdf_path)
                try:
                    if translation_config.debug:
                        translation_config.raise_if_cancelled()
                        try:
                            self.write_debug_info(dual, translation_config)
                        except Exception:
                            logger.warning(
                                "Failed to write debug info to dual PDF", exc_info=True
                            )
                    dual.insert_file(pdf)
                    page_count = pdf.page_count
                    # The translated pages were appended after the originals;
                    # move each one next to its original page.
                    for page_id in range(page_count):
                        if translation_config.dual_translate_first:
                            dual.move_page(page_count + page_id, page_id * 2)
                        else:
                            dual.move_page(page_count + page_id, page_id * 2 + 1)
                    dual.save(
                        dual_out_path,
                        garbage=3,
                        deflate=True,
                        clean=not translation_config.skip_clean,
                        deflate_fonts=True,
                        linear=True,
                    )
                    if translation_config.debug:
                        translation_config.raise_if_cancelled()
                        dual.save(
                            f"{dual_out_path}.decompressed.pdf", expand=True, pretty=True
                        )
                finally:
                    # Release the second document even on cancellation/failure.
                    dual.close()
            pbar.advance()
        return TranslateResult(mono_out_path, dual_out_path)
    finally:
        # Release the working document on every exit path.
        pdf.close()
|
src/pdf2u/document_il/frontend/__init__.py
ADDED
|
File without changes
|
src/pdf2u/document_il/frontend/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (182 Bytes). View file
|
|
|
src/pdf2u/document_il/frontend/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (170 Bytes). View file
|
|
|
src/pdf2u/document_il/frontend/__pycache__/il_creater.cpython-311.pyc
ADDED
|
Binary file (19 kB). View file
|
|
|
src/pdf2u/document_il/frontend/__pycache__/il_creater.cpython-312.pyc
ADDED
|
Binary file (18 kB). View file
|
|
|
src/pdf2u/document_il/frontend/il_creater.py
ADDED
|
@@ -0,0 +1,328 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import logging
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
import pdfminer.pdfinterp
|
| 6 |
+
import pymupdf
|
| 7 |
+
from pdfminer.layout import LTChar, LTFigure
|
| 8 |
+
from pdfminer.pdffont import PDFCIDFont, PDFFont
|
| 9 |
+
from pdfminer.psparser import PSLiteral
|
| 10 |
+
|
| 11 |
+
from pdf2u.document_il import il_version_1
|
| 12 |
+
from pdf2u.translation_config import TranslationConfig
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class ILCreater:
    """Collect PDF parsing callbacks into an ``il_version_1.Document``.

    The instance keeps the parsing state that is current while a page is
    being interpreted (active page, active XObject id, color-space names and
    the per-character pass-through instructions) and appends pages, fonts,
    XObjects, figures and characters to ``self.docs``.
    """

    stage_name = "Parse PDF and Create Intermediate Representation"

    # First hex string on the line that follows ``begincodespacerange``.
    # Codespace bounds are hexadecimal strings, so hex digits are accepted,
    # and ``\s*`` tolerates CRLF line endings after the keyword.
    _CODESPACE_RANGE_RE = re.compile(rb"begincodespacerange\s*.*<([0-9A-Fa-f]+?)>")

    def __init__(self, translation_config: TranslationConfig):
        """Initialise empty parsing state for *translation_config*."""
        self.progress = None
        self.current_page: il_version_1.Page = None
        self.mupdf: pymupdf.Document = None
        self.model = translation_config.doc_layout_model
        self.docs = il_version_1.Document(page=[])
        self.stroking_color_space_name = None
        self.non_stroking_color_space_name = None
        self.passthrough_per_char_instruction: list[tuple[str, str]] = []
        self.translation_config = translation_config
        self.passthrough_per_char_instruction_stack: list[list[tuple[str, str]]] = []
        self.xobj_id = 0
        self.xobj_inc = 0
        self.xobj_map: dict[int, il_version_1.PdfXobject] = {}
        self.xobj_stack = []
        # Defined here so push_xobj()/on_page_resource_font() do not raise
        # AttributeError when called before the first on_page_start().
        self.current_page_font_name_id_map: dict = {}

    def on_finish(self):
        """Close the progress stage opened by ``on_total_pages``."""
        self.progress.__exit__(None, None, None)

    def is_passthrough_per_char_operation(self, operator: str):
        """Return a match object if *operator* is a pass-through operator, else None."""
        return re.match("^(sc|scn|g|rg|k|cs|gs|ri)$", operator, re.IGNORECASE)

    def on_passthrough_per_char(self, operator: str, args: list[str]):
        """Record *operator* with *args*, replacing an earlier entry of the same operator.

        Unknown operators are logged and ignored.
        """
        if not self.is_passthrough_per_char_operation(operator):
            logger.error("Unknown passthrough_per_char operation: %s", operator)
            return
        args = [self.parse_arg(arg) for arg in args]
        instructions = self.passthrough_per_char_instruction
        # Operator comparison is case-sensitive on purpose: e.g. "rg" and
        # "RG" are distinct entries.
        for index, (op, _arg) in enumerate(instructions):
            if op == operator:
                del instructions[index]
                break
        instructions.append((operator, " ".join(args)))

    def remove_latest_passthrough_per_char_instruction(self):
        """Drop the most recently recorded pass-through instruction, if any."""
        if self.passthrough_per_char_instruction:
            self.passthrough_per_char_instruction.pop()

    def parse_arg(self, arg: str):
        """Return *arg* as operand text: ``/name`` for a PSLiteral, ``str(arg)`` otherwise."""
        if isinstance(arg, PSLiteral):
            return f"/{arg.name}"
        if not isinstance(arg, str):
            return str(arg)
        return arg

    def pop_passthrough_per_char_instruction(self):
        """Restore the instruction list saved by the matching push.

        If the stack is empty the list is reset and an error is logged.
        """
        if self.passthrough_per_char_instruction_stack:
            self.passthrough_per_char_instruction = (
                self.passthrough_per_char_instruction_stack.pop()
            )
        else:
            self.passthrough_per_char_instruction = []
            # Use the module logger like the rest of this class.
            logger.error(
                "pop_passthrough_per_char_instruction error on page: %s",
                self.current_page.page_number,
            )

    def push_passthrough_per_char_instruction(self):
        """Save a copy of the current instruction list on the stack."""
        self.passthrough_per_char_instruction_stack.append(
            self.passthrough_per_char_instruction.copy()
        )

    # pdf32000 page 171
    def on_stroking_color_space(self, color_space_name):
        """Remember the current stroking color space name."""
        self.stroking_color_space_name = color_space_name

    def on_non_stroking_color_space(self, color_space_name):
        """Remember the current non-stroking color space name."""
        self.non_stroking_color_space_name = color_space_name

    def on_new_stream(self):
        """Reset color-space names and pass-through instructions for a new stream."""
        self.stroking_color_space_name = None
        self.non_stroking_color_space_name = None
        self.passthrough_per_char_instruction = []

    def push_xobj(self):
        """Save the current font-name map and XObject id, then start an empty map."""
        self.xobj_stack.append(
            (self.current_page_font_name_id_map.copy(), self.xobj_id)
        )
        self.current_page_font_name_id_map = {}

    def pop_xobj(self):
        """Restore the font-name map and XObject id saved by ``push_xobj``."""
        self.current_page_font_name_id_map, self.xobj_id = self.xobj_stack.pop()

    def on_xobj_begin(self, bbox, xref_id):
        """Register a new XObject on the current page and make it current.

        Returns:
            The id assigned to the new XObject.
        """
        self.push_passthrough_per_char_instruction()
        self.push_xobj()
        self.xobj_inc += 1
        self.xobj_id = self.xobj_inc
        xobject = il_version_1.PdfXobject(
            box=il_version_1.Box(
                x=float(bbox[0]), y=float(bbox[1]), x2=float(bbox[2]), y2=float(bbox[3])
            ),
            xobj_id=self.xobj_id,
            xref_id=xref_id,
        )
        self.current_page.pdf_xobject.append(xobject)
        self.xobj_map[self.xobj_id] = xobject
        return self.xobj_id

    def on_xobj_end(self, xobj_id, base_op):
        """Restore the enclosing state and store *base_op* on XObject *xobj_id*."""
        self.pop_passthrough_per_char_instruction()
        self.pop_xobj()
        xobj = self.xobj_map[xobj_id]
        xobj.base_operations = il_version_1.BaseOperations(value=base_op)
        self.xobj_inc += 1

    def on_page_start(self):
        """Create a fresh page, reset per-page state and append the page to the document."""
        self.current_page = il_version_1.Page(
            pdf_font=[],
            pdf_character=[],
            page_layout=[],
            # currently don't support UserUnit page parameter
            # pdf32000 page 79
            unit="point",
        )
        self.current_page_font_name_id_map = {}
        self.passthrough_per_char_instruction_stack = []
        self.xobj_stack = []
        self.non_stroking_color_space_name = None
        self.stroking_color_space_name = None
        self.docs.page.append(self.current_page)

    def on_page_end(self):
        """Advance the progress stage by one page."""
        self.progress.advance(1)

    def on_page_crop_box(
        self, x0: float | int, y0: float | int, x1: float | int, y1: float | int
    ):
        """Store the crop box of the current page."""
        box = il_version_1.Box(x=float(x0), y=float(y0), x2=float(x1), y2=float(y1))
        self.current_page.cropbox = il_version_1.Cropbox(box=box)

    def on_page_media_box(
        self, x0: float | int, y0: float | int, x1: float | int, y1: float | int
    ):
        """Store the media box of the current page."""
        box = il_version_1.Box(x=float(x0), y=float(y0), x2=float(x1), y2=float(y1))
        self.current_page.mediabox = il_version_1.Mediabox(box=box)

    def on_page_number(self, page_number: int):
        """Store the (non-negative) page number of the current page."""
        assert isinstance(page_number, int)
        assert page_number >= 0
        self.current_page.page_number = page_number

    def on_page_base_operation(self, operation: str):
        """Store the base operations string of the current page."""
        self.current_page.base_operations = il_version_1.BaseOperations(value=operation)

    @classmethod
    def _codespace_byte_length(cls, cmap_bytes: bytes) -> int:
        """Return the byte length of a codespace bound found in *cmap_bytes*.

        Raises:
            ValueError: If no ``begincodespacerange`` entry can be found.
        """
        match = cls._CODESPACE_RANGE_RE.search(cmap_bytes)
        if match is None:
            raise ValueError("no codespace range found in CMap stream")
        return len(match.group(1)) // 2

    @staticmethod
    def _guess_encoding_length(font) -> int:
        """Guess the code length from the font's CID map: 2 if any CID > 255, else 1.

        A missing or empty map yields 1 instead of raising.
        """
        cid_map = getattr(getattr(font, "unicode_map", None), "cid2unichr", None)
        if cid_map and max(cid_map) > 255:
            return 2
        return 1

    def on_page_resource_font(self, font: PDFFont, xref_id: int, font_id: str):
        """Record metadata of a font resource on the current XObject or page.

        The font name is decoded as UTF-8 (or stored as ``BASE64:...`` when
        that fails).  For CID fonts the encoding length is 2 for Identity
        encodings, otherwise it is read from the ToUnicode codespace range;
        if that fails it is guessed from the CID map.  Style flags are taken
        from the embedded font when it can be loaded, else left as ``None``.
        """
        font_name = font.fontname
        if isinstance(font_name, bytes):
            try:
                font_name = font_name.decode("utf-8")
            except UnicodeDecodeError:
                font_name = "BASE64:" + base64.b64encode(font_name).decode("utf-8")
        encoding_length = 1
        if isinstance(font, PDFCIDFont):
            try:
                # pdf 32000:2008 page 273
                # Table 118 - Predefined CJK CMap names
                _, encoding = self.mupdf.xref_get_key(xref_id, "Encoding")
                if encoding in ("/Identity-H", "/Identity-V"):
                    encoding_length = 2
                else:
                    _, to_unicode_id = self.mupdf.xref_get_key(xref_id, "ToUnicode")
                    to_unicode_bytes = self.mupdf.xref_stream(
                        int(to_unicode_id.split(" ")[0])
                    )
                    encoding_length = self._codespace_byte_length(to_unicode_bytes)
            except Exception:
                # Deliberate best effort: any failure above falls back to a
                # guess that itself cannot raise on a missing/empty map.
                encoding_length = self._guess_encoding_length(font)
        try:
            mupdf_font = pymupdf.Font(fontbuffer=self.mupdf.extract_font(xref_id)[3])
            bold = mupdf_font.is_bold
            italic = mupdf_font.is_italic
            monospaced = mupdf_font.is_monospaced
            serif = mupdf_font.is_serif
        except Exception:
            bold = None
            italic = None
            monospaced = None
            serif = None
        il_font_metadata = il_version_1.PdfFont(
            name=font_name,
            xref_id=xref_id,
            font_id=font_id,
            encoding_length=encoding_length,
            bold=bold,
            italic=italic,
            monospace=monospaced,
            serif=serif,
            ascent=font.ascent,
            descent=font.descent,
        )
        self.current_page_font_name_id_map[font_name] = font_id
        if self.xobj_id in self.xobj_map:
            self.xobj_map[self.xobj_id].pdf_font.append(il_font_metadata)
        else:
            self.current_page.pdf_font.append(il_font_metadata)

    def create_graphic_state(self, gs: pdfminer.pdfinterp.PDFGraphicState):
        """Build an ``il_version_1.GraphicState`` from *gs*.

        Only ``scolor``, ``ncolor`` (stored as lists) and ``linewidth``
        (stored as float) are copied from *gs*; other attributes are
        ignored.  Color-space names come from this object's current state and
        the pass-through text from ``gs.passthrough_instruction``.
        """
        graphic_state = il_version_1.GraphicState()
        for k, v in gs.__dict__.items():
            if v is None:
                continue
            if k in ("scolor", "ncolor"):
                setattr(graphic_state, k, list(v) if isinstance(v, tuple) else [v])
            elif k == "linewidth":
                graphic_state.linewidth = float(v)

        graphic_state.stroking_color_space_name = self.stroking_color_space_name
        graphic_state.non_stroking_color_space_name = self.non_stroking_color_space_name

        graphic_state.passthrough_per_char_instruction = " ".join(
            f"{arg} {op}" for op, arg in gs.passthrough_instruction
        )

        return graphic_state

    def on_lt_char(self, char: LTChar):
        """Convert *char* into a ``PdfCharacter`` and append it to the current page.

        Characters whose text is longer than one character and does not
        contain ``(cid:`` are skipped.  The box is shifted by the font
        descent scaled to the character size (along x for vertical text,
        along y otherwise).
        """
        gs = self.create_graphic_state(char.graphicstate)
        # Get font from current page or xobject
        fonts = self.xobj_map.get(self.xobj_id, self.current_page).pdf_font
        font = next((f for f in fonts if f.font_id == char.aw_font_id), None)

        # Get descent from font; a font without descent contributes no shift.
        descent = 0
        if font is not None and font.descent is not None:
            descent = font.descent * char.size / 1000

        char_id = char.cid
        char_unicode = char.get_text()
        if "(cid:" not in char_unicode and len(char_unicode) > 1:
            return
        advance = char.adv
        if char.matrix[0] == 0 and char.matrix[3] == 0:
            vertical = True
            bbox = il_version_1.Box(
                x=char.bbox[0] - descent,
                y=char.bbox[1],
                x2=char.bbox[2] - descent,
                y2=char.bbox[3],
            )
        else:
            vertical = False
            # Add descent to y coordinates
            bbox = il_version_1.Box(
                x=char.bbox[0],
                y=char.bbox[1] + descent,
                x2=char.bbox[2],
                y2=char.bbox[3] + descent,
            )
        pdf_style = il_version_1.PdfStyle(
            font_id=char.aw_font_id, font_size=char.size, graphic_state=gs
        )
        pdf_char = il_version_1.PdfCharacter(
            box=bbox,
            pdf_character_id=char_id,
            advance=advance,
            char_unicode=char_unicode,
            vertical=vertical,
            pdf_style=pdf_style,
            xobj_id=char.xobj_id,
        )
        self.current_page.pdf_character.append(pdf_char)

    def create_il(self):
        """Drop pages that should not be translated and return the document."""
        pages = [
            page
            for page in self.docs.page
            if self.translation_config.should_translate_page(page.page_number + 1)
        ]
        self.docs.page = pages
        return self.docs

    def on_total_pages(self, total_pages: int):
        """Store the page count and open a progress stage sized to the pages to translate."""
        assert isinstance(total_pages, int)
        assert total_pages > 0
        self.docs.total_pages = total_pages
        total = sum(
            1
            for page in range(total_pages)
            if self.translation_config.should_translate_page(page + 1) is not False
        )
        self.progress = self.translation_config.progress_monitor.stage_start(
            self.stage_name, total
        )

    def on_pdf_figure(self, figure: LTFigure):
        """Append the bounding box of *figure* to the current page's figures."""
        box = il_version_1.Box(
            figure.bbox[0], figure.bbox[1], figure.bbox[2], figure.bbox[3]
        )
        self.current_page.pdf_figure.append(il_version_1.PdfFigure(box=box))
|
src/pdf2u/document_il/il_version_1.py
ADDED
|
@@ -0,0 +1,396 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass, field
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
@dataclass
class BaseOperations:
    """Wrapper around a single string of base operations."""

    class Meta:
        name = "baseOperations"

    # Operation text; defaults to the empty string.
    value: str = field(default="", metadata=dict(required=True))
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
|
| 13 |
+
class Box:
|
| 14 |
+
class Meta:
|
| 15 |
+
name = "box"
|
| 16 |
+
|
| 17 |
+
x: float | None = field(
|
| 18 |
+
default=None, metadata={"type": "Attribute", "required": True}
|
| 19 |
+
)
|
| 20 |
+
y: float | None = field(
|
| 21 |
+
default=None, metadata={"type": "Attribute", "required": True}
|
| 22 |
+
)
|
| 23 |
+
x2: float | None = field(
|
| 24 |
+
default=None, metadata={"type": "Attribute", "required": True}
|
| 25 |
+
)
|
| 26 |
+
y2: float | None = field(
|
| 27 |
+
default=None, metadata={"type": "Attribute", "required": True}
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@dataclass
|
| 32 |
+
class GraphicState:
|
| 33 |
+
class Meta:
|
| 34 |
+
name = "graphicState"
|
| 35 |
+
|
| 36 |
+
linewidth: float | None = field(default=None, metadata={"type": "Attribute"})
|
| 37 |
+
dash: list[float] = field(
|
| 38 |
+
default_factory=list,
|
| 39 |
+
metadata={"type": "Attribute", "min_length": 1, "tokens": True},
|
| 40 |
+
)
|
| 41 |
+
flatness: float | None = field(default=None, metadata={"type": "Attribute"})
|
| 42 |
+
intent: str | None = field(default=None, metadata={"type": "Attribute"})
|
| 43 |
+
linecap: int | None = field(default=None, metadata={"type": "Attribute"})
|
| 44 |
+
linejoin: int | None = field(default=None, metadata={"type": "Attribute"})
|
| 45 |
+
miterlimit: float | None = field(default=None, metadata={"type": "Attribute"})
|
| 46 |
+
ncolor: list[float] = field(
|
| 47 |
+
default_factory=list,
|
| 48 |
+
metadata={"type": "Attribute", "min_length": 1, "tokens": True},
|
| 49 |
+
)
|
| 50 |
+
scolor: list[float] = field(
|
| 51 |
+
default_factory=list,
|
| 52 |
+
metadata={"type": "Attribute", "min_length": 1, "tokens": True},
|
| 53 |
+
)
|
| 54 |
+
stroking_color_space_name: str | None = field(
|
| 55 |
+
default=None, metadata={"name": "strokingColorSpaceName", "type": "Attribute"}
|
| 56 |
+
)
|
| 57 |
+
non_stroking_color_space_name: str | None = field(
|
| 58 |
+
default=None,
|
| 59 |
+
metadata={"name": "nonStrokingColorSpaceName", "type": "Attribute"},
|
| 60 |
+
)
|
| 61 |
+
passthrough_per_char_instruction: str | None = field(
|
| 62 |
+
default=None,
|
| 63 |
+
metadata={"name": "passthroughPerCharInstruction", "type": "Attribute"},
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@dataclass
|
| 68 |
+
class PdfFont:
|
| 69 |
+
class Meta:
|
| 70 |
+
name = "pdfFont"
|
| 71 |
+
|
| 72 |
+
name: str | None = field(
|
| 73 |
+
default=None, metadata={"type": "Attribute", "required": True}
|
| 74 |
+
)
|
| 75 |
+
font_id: str | None = field(
|
| 76 |
+
default=None, metadata={"name": "fontId", "type": "Attribute", "required": True}
|
| 77 |
+
)
|
| 78 |
+
xref_id: int | None = field(
|
| 79 |
+
default=None, metadata={"name": "xrefId", "type": "Attribute", "required": True}
|
| 80 |
+
)
|
| 81 |
+
encoding_length: int | None = field(
|
| 82 |
+
default=None,
|
| 83 |
+
metadata={"name": "encodingLength", "type": "Attribute", "required": True},
|
| 84 |
+
)
|
| 85 |
+
bold: bool | None = field(default=None, metadata={"type": "Attribute"})
|
| 86 |
+
italic: bool | None = field(default=None, metadata={"type": "Attribute"})
|
| 87 |
+
monospace: bool | None = field(default=None, metadata={"type": "Attribute"})
|
| 88 |
+
serif: bool | None = field(default=None, metadata={"type": "Attribute"})
|
| 89 |
+
ascent: float | None = field(default=None, metadata={"type": "Attribute"})
|
| 90 |
+
descent: float | None = field(default=None, metadata={"type": "Attribute"})
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
@dataclass
class Cropbox:
    """Element holding a single ``Box``."""

    class Meta:
        name = "cropbox"

    box: Box | None = field(
        default=None, metadata=dict(type="Element", required=True)
    )
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
@dataclass
class Mediabox:
    """Element holding a single ``Box``."""

    class Meta:
        name = "mediabox"

    box: Box | None = field(
        default=None, metadata=dict(type="Element", required=True)
    )
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
@dataclass
class PageLayout:
    """Layout entry: a box plus ``id``, ``conf`` and ``class_name`` attributes."""

    class Meta:
        name = "pageLayout"

    box: Box | None = field(
        default=None, metadata=dict(type="Element", required=True)
    )
    id: int | None = field(
        default=None, metadata=dict(type="Attribute", required=True)
    )
    conf: float | None = field(
        default=None, metadata=dict(type="Attribute", required=True)
    )
    class_name: str | None = field(
        default=None, metadata=dict(type="Attribute", required=True)
    )
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
@dataclass
class PdfFigure:
    """Figure entry holding a single ``Box``."""

    class Meta:
        name = "pdfFigure"

    box: Box | None = field(
        default=None, metadata=dict(type="Element", required=True)
    )
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
@dataclass
class PdfRectangle:
    """Rectangle entry: a box, its graphic state and an optional ``debug_info`` flag."""

    class Meta:
        name = "pdfRectangle"

    box: Box | None = field(
        default=None, metadata=dict(type="Element", required=True)
    )
    graphic_state: GraphicState | None = field(
        default=None,
        metadata=dict(name="graphicState", type="Element", required=True),
    )
    debug_info: bool | None = field(default=None, metadata=dict(type="Attribute"))
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
@dataclass
class PdfStyle:
    """Style entry: graphic state, font id and font size."""

    class Meta:
        name = "pdfStyle"

    graphic_state: GraphicState | None = field(
        default=None,
        metadata=dict(name="graphicState", type="Element", required=True),
    )
    font_id: str | None = field(
        default=None, metadata=dict(type="Attribute", required=True)
    )
    font_size: float | None = field(
        default=None, metadata=dict(type="Attribute", required=True)
    )
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
@dataclass
class PdfXobject:
    """XObject entry: box, fonts, base operations and the two ids."""

    class Meta:
        name = "pdfXobject"

    box: Box | None = field(
        default=None, metadata=dict(type="Element", required=True)
    )
    pdf_font: list[PdfFont] = field(
        default_factory=list, metadata=dict(name="pdfFont", type="Element")
    )
    base_operations: BaseOperations | None = field(
        default=None,
        metadata=dict(name="baseOperations", type="Element", required=True),
    )
    xobj_id: int | None = field(
        default=None, metadata=dict(name="xobjId", type="Attribute", required=True)
    )
    xref_id: int | None = field(
        default=None, metadata=dict(name="xrefId", type="Attribute", required=True)
    )
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
@dataclass
class PdfCharacter:
    """Binding for the ``pdfCharacter`` element: one character with style and box.

    Only ``pdf_style``, ``box`` and ``char_unicode`` are flagged required in
    the field metadata; every other attribute is optional.
    """

    class Meta:
        # XML name; matches ``element pdfCharacter`` in il_version_1.rnc.
        name = "pdfCharacter"

    # --- child elements ---
    pdf_style: PdfStyle | None = field(
        default=None,
        metadata={"name": "pdfStyle", "type": "Element", "required": True},
    )
    box: Box | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )

    # --- attributes ---
    vertical: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    scale: float | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    pdf_character_id: int | None = field(
        default=None,
        metadata={"name": "pdfCharacterId", "type": "Attribute"},
    )
    # Keeps its snake_case name on the wire (no "name" override).
    char_unicode: str | None = field(
        default=None,
        metadata={"type": "Attribute", "required": True},
    )
    advance: float | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    xobj_id: int | None = field(
        default=None,
        metadata={"name": "xobjId", "type": "Attribute"},
    )
    debug_info: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
@dataclass
class PdfSameStyleUnicodeCharacters:
    """Binding for ``pdfSameStyleUnicodeCharacters``: text plus an optional style."""

    class Meta:
        # XML name; matches the element of the same name in il_version_1.rnc.
        name = "pdfSameStyleUnicodeCharacters"

    # --- child element ---
    # Optional here (no "required" key), unlike the style on most siblings.
    pdf_style: PdfStyle | None = field(
        default=None,
        metadata={"name": "pdfStyle", "type": "Element"},
    )

    # --- attributes ---
    unicode: str | None = field(
        default=None,
        metadata={"type": "Attribute", "required": True},
    )
    debug_info: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
@dataclass
class PdfFormula:
    """Binding for the ``pdfFormula`` element: a box, characters and x/y offsets."""

    class Meta:
        # XML name; matches ``element pdfFormula`` in il_version_1.rnc.
        name = "pdfFormula"

    # --- child elements ---
    box: Box | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )
    # The metadata asks for at least one entry (min_occurs=1); the dataclass
    # itself still starts from an empty list.
    pdf_character: list[PdfCharacter] = field(
        default_factory=list,
        metadata={"name": "pdfCharacter", "type": "Element", "min_occurs": 1},
    )

    # --- attributes (both flagged required in the metadata) ---
    x_offset: float | None = field(
        default=None,
        metadata={"type": "Attribute", "required": True},
    )
    y_offset: float | None = field(
        default=None,
        metadata={"type": "Attribute", "required": True},
    )
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
@dataclass
class PdfLine:
    """Binding for the ``pdfLine`` element: a box and the characters on the line."""

    class Meta:
        # XML name; matches ``element pdfLine`` in il_version_1.rnc.
        name = "pdfLine"

    box: Box | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )
    # The metadata asks for at least one entry (min_occurs=1); the dataclass
    # itself still starts from an empty list.
    pdf_character: list[PdfCharacter] = field(
        default_factory=list,
        metadata={"name": "pdfCharacter", "type": "Element", "min_occurs": 1},
    )
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
@dataclass
class PdfSameStyleCharacters:
    """Binding for ``pdfSameStyleCharacters``: a box, one style, and characters."""

    class Meta:
        # XML name; matches the element of the same name in il_version_1.rnc.
        name = "pdfSameStyleCharacters"

    box: Box | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )
    pdf_style: PdfStyle | None = field(
        default=None,
        metadata={"name": "pdfStyle", "type": "Element", "required": True},
    )
    # The metadata asks for at least one entry (min_occurs=1); the dataclass
    # itself still starts from an empty list.
    pdf_character: list[PdfCharacter] = field(
        default_factory=list,
        metadata={"name": "pdfCharacter", "type": "Element", "min_occurs": 1},
    )
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
@dataclass
class PdfParagraphComposition:
    """Binding for the ``pdfParagraphComposition`` element.

    il_version_1.rnc declares the five children below as alternatives
    (a choice). Nothing in this class enforces that only one of them is
    set; every field simply defaults to None.
    """

    class Meta:
        # XML name; matches the element of the same name in il_version_1.rnc.
        name = "pdfParagraphComposition"

    pdf_line: PdfLine | None = field(
        default=None,
        metadata={"name": "pdfLine", "type": "Element"},
    )
    pdf_formula: PdfFormula | None = field(
        default=None,
        metadata={"name": "pdfFormula", "type": "Element"},
    )
    pdf_same_style_characters: PdfSameStyleCharacters | None = field(
        default=None,
        metadata={"name": "pdfSameStyleCharacters", "type": "Element"},
    )
    pdf_character: PdfCharacter | None = field(
        default=None,
        metadata={"name": "pdfCharacter", "type": "Element"},
    )
    pdf_same_style_unicode_characters: PdfSameStyleUnicodeCharacters | None = field(
        default=None,
        metadata={"name": "pdfSameStyleUnicodeCharacters", "type": "Element"},
    )
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
@dataclass
class PdfParagraph:
    """Binding for the ``pdfParagraph`` element.

    Holds the paragraph box and style, an ordered list of composition
    entries, and a set of attributes of which only ``unicode`` is flagged
    required in the metadata.
    """

    class Meta:
        # XML name; matches ``element pdfParagraph`` in il_version_1.rnc.
        name = "pdfParagraph"

    # --- child elements ---
    box: Box | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )
    pdf_style: PdfStyle | None = field(
        default=None,
        metadata={"name": "pdfStyle", "type": "Element", "required": True},
    )
    # Repeatable and may be empty (no min_occurs in the metadata).
    pdf_paragraph_composition: list[PdfParagraphComposition] = field(
        default_factory=list,
        metadata={"name": "pdfParagraphComposition", "type": "Element"},
    )

    # --- attributes ---
    xobj_id: int | None = field(
        default=None,
        metadata={"name": "xobjId", "type": "Attribute"},
    )
    unicode: str | None = field(
        default=None,
        metadata={"type": "Attribute", "required": True},
    )
    scale: float | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    vertical: bool | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
    # Note the PascalCase wire name, unlike the camelCase used elsewhere.
    first_line_indent: bool | None = field(
        default=None,
        metadata={"name": "FirstLineIndent", "type": "Attribute"},
    )
    debug_id: str | None = field(
        default=None,
        metadata={"type": "Attribute"},
    )
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
@dataclass
class Page:
    """Binding for the ``page`` element of the document IL.

    Field order follows the child order declared for ``page`` in
    il_version_1.rnc: the two page boxes, the repeatable content lists,
    the base-operations element, then the ``pageNumber`` / ``Unit``
    attributes.
    """

    class Meta:
        # XML name; matches ``element page`` in il_version_1.rnc.
        name = "page"

    # --- page boxes (both flagged required in the metadata) ---
    mediabox: Mediabox | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )
    cropbox: Cropbox | None = field(
        default=None,
        metadata={"type": "Element", "required": True},
    )

    # --- repeatable content; each list is created fresh per instance ---
    pdf_xobject: list[PdfXobject] = field(
        default_factory=list,
        metadata={"name": "pdfXobject", "type": "Element"},
    )
    page_layout: list[PageLayout] = field(
        default_factory=list,
        metadata={"name": "pageLayout", "type": "Element"},
    )
    pdf_rectangle: list[PdfRectangle] = field(
        default_factory=list,
        metadata={"name": "pdfRectangle", "type": "Element"},
    )
    pdf_font: list[PdfFont] = field(
        default_factory=list,
        metadata={"name": "pdfFont", "type": "Element"},
    )
    pdf_paragraph: list[PdfParagraph] = field(
        default_factory=list,
        metadata={"name": "pdfParagraph", "type": "Element"},
    )
    pdf_figure: list[PdfFigure] = field(
        default_factory=list,
        metadata={"name": "pdfFigure", "type": "Element"},
    )
    pdf_character: list[PdfCharacter] = field(
        default_factory=list,
        metadata={"name": "pdfCharacter", "type": "Element"},
    )

    # --- single required element ---
    base_operations: BaseOperations | None = field(
        default=None,
        metadata={"name": "baseOperations", "type": "Element", "required": True},
    )

    # --- attributes (both flagged required in the metadata) ---
    page_number: int | None = field(
        default=None,
        metadata={"name": "pageNumber", "type": "Attribute", "required": True},
    )
    # Note the capitalised wire name "Unit".
    unit: str | None = field(
        default=None,
        metadata={"name": "Unit", "type": "Attribute", "required": True},
    )
|
| 383 |
+
|
| 384 |
+
|
| 385 |
+
@dataclass
class Document:
    """Binding for the root ``document`` element: pages plus a page count."""

    class Meta:
        # XML name; matches ``element document`` in il_version_1.rnc.
        name = "document"

    # The metadata asks for at least one page (min_occurs=1); the dataclass
    # itself still starts from an empty list.
    page: list[Page] = field(
        default_factory=list,
        metadata={"type": "Element", "min_occurs": 1},
    )
    # Stored independently of ``page``; nothing here keeps it in sync with
    # len(page).
    total_pages: int | None = field(
        default=None,
        metadata={"name": "totalPages", "type": "Attribute", "required": True},
    )
|
src/pdf2u/document_il/il_version_1.rnc
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Entry point: a valid instance is rooted at a single <document> element.
start = Document

# <document>: one or more <page> children plus a required totalPages attribute.
Document =
  element document {
    Page+,
    attribute totalPages { xsd:int }
  }
|
| 7 |
+
# <page>: the two page boxes, then zero or more of each content kind in the
# order listed, the required pageNumber/Unit attributes, and a baseOperations
# string element.
Page =
  element page {
    element mediabox { Box },
    element cropbox { Box },
    PDFXobject*,
    PageLayout*,
    PDFRectangle*,
    PDFFont*,
    PDFParagraph*,
    PDFFigure*,
    PDFCharacter*,
    attribute pageNumber { xsd:int },
    attribute Unit { xsd:string },
    element baseOperations { xsd:string }
  }
|
| 22 |
+
# <box>: four required float attributes describing a rectangle.
Box =
  element box {
    # from (x,y) to (x2,y2)
    attribute x { xsd:float },
    attribute y { xsd:float },
    attribute x2 { xsd:float },
    attribute y2 { xsd:float }
  }
|
| 30 |
+
# Shared datatype for the xrefId attributes of pdfFont and pdfXobject.
PDFXrefId = xsd:int
|
| 31 |
+
# <pdfFont>: four required identifying attributes followed by optional
# style flags (bold/italic/monospace/serif) and optional ascent/descent.
PDFFont =
  element pdfFont {
    attribute name { xsd:string },
    attribute fontId { xsd:string },
    attribute xrefId { PDFXrefId },
    attribute encodingLength { xsd:int },
    attribute bold { xsd:boolean }?,
    attribute italic { xsd:boolean }?,
    attribute monospace { xsd:boolean }?,
    attribute serif { xsd:boolean }?,
    attribute ascent { xsd:float }?,
    attribute descent { xsd:float }?
  }
|
| 44 |
+
# <pdfXobject>: required xobjId/xrefId attributes, a box, any number of
# fonts, and a baseOperations string element.
PDFXobject =
  element pdfXobject {
    attribute xobjId { xsd:int },
    attribute xrefId { PDFXrefId },
    Box,
    PDFFont*,
    element baseOperations { xsd:string }
  }
|
| 52 |
+
# <pdfCharacter>: char_unicode is the only required attribute; the element
# content is a style followed by a box.
PDFCharacter =
  element pdfCharacter {
    attribute vertical { xsd:boolean }?,
    attribute scale { xsd:float }?,
    attribute pdfCharacterId { xsd:int }?,
    attribute char_unicode { xsd:string },
    attribute advance { xsd:float }?,
    # xobject nesting depth
    attribute xobjId { xsd:int }?,
    attribute debug_info { xsd:boolean }?,
    PDFStyle,
    Box
  }
|
| 65 |
+
# <pageLayout>: required id, conf and class_name attributes plus a box.
PageLayout =
  element pageLayout {
    attribute id { xsd:int },
    attribute conf { xsd:float },
    attribute class_name { xsd:string },
    Box
  }
|
| 72 |
+
# <graphicState>: every attribute is optional. dash, ncolor and scolor are
# whitespace-separated lists of one or more floats.
GraphicState =
  element graphicState {
    attribute linewidth { xsd:float }?,
    attribute dash { list { xsd:float+ } }?,
    attribute flatness { xsd:float }?,
    attribute intent { xsd:string }?,
    attribute linecap { xsd:int }?,
    attribute linejoin { xsd:int }?,
    attribute miterlimit { xsd:float }?,
    attribute ncolor { list { xsd:float+ } }?,
    attribute scolor { list { xsd:float+ } }?,
    attribute strokingColorSpaceName { xsd:string }?,
    attribute nonStrokingColorSpaceName { xsd:string }?,
    attribute passthroughPerCharInstruction { xsd:string }?
  }
|
| 93 |
+
# <pdfStyle>: required font_id and font_size attributes plus a graphic state.
PDFStyle =
  element pdfStyle {
    attribute font_id { xsd:string },
    attribute font_size { xsd:float },
    GraphicState
  }
|
| 99 |
+
# <pdfParagraph>: unicode is the only required attribute; the content is a
# box, a style, and zero or more composition entries.
PDFParagraph =
  element pdfParagraph {
    attribute xobjId { xsd:int }?,
    attribute unicode { xsd:string },
    attribute scale { xsd:float }?,
    attribute vertical { xsd:boolean }?,
    attribute FirstLineIndent { xsd:boolean }?,
    attribute debug_id { xsd:string }?,
    Box,
    PDFStyle,
    PDFParagraphComposition*
  }
|
| 111 |
+
# <pdfParagraphComposition>: exactly one of the five alternatives below.
PDFParagraphComposition =
  element pdfParagraphComposition {
    PDFLine
    | PDFFormula
    | PDFSameStyleCharacters
    | PDFCharacter
    | PDFSameStyleUnicodeCharacters
  }
|
| 119 |
+
# <pdfLine>: a box followed by at least one character.
PDFLine =
  element pdfLine {
    Box,
    PDFCharacter+
  }
|
| 120 |
+
# <pdfSameStyleCharacters>: a box, one style, then at least one character.
PDFSameStyleCharacters =
  element pdfSameStyleCharacters {
    Box,
    PDFStyle,
    PDFCharacter+
  }
|
| 122 |
+
# <pdfSameStyleUnicodeCharacters>: an optional style, the required unicode
# attribute, and an optional debug_info flag.
PDFSameStyleUnicodeCharacters =
  element pdfSameStyleUnicodeCharacters {
    PDFStyle?,
    attribute unicode { xsd:string },
    attribute debug_info { xsd:boolean }?
  }
|
| 128 |
+
# <pdfFormula>: a box, at least one character, and required x/y offsets.
PDFFormula =
  element pdfFormula {
    Box,
    PDFCharacter+,
    attribute x_offset { xsd:float },
    attribute y_offset { xsd:float }
  }
|
| 135 |
+
# <pdfFigure>: nothing but a box.
PDFFigure =
  element pdfFigure {
    Box
  }
|
| 136 |
+
# <pdfRectangle>: a box, a graphic state, and an optional debug_info flag.
PDFRectangle =
  element pdfRectangle {
    Box,
    GraphicState,
    attribute debug_info { xsd:boolean }?
  }
|
src/pdf2u/document_il/il_version_1.rng
ADDED
|
@@ -0,0 +1,390 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<grammar xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
  <!-- RELAX NG (XML syntax) form of the document IL grammar; the patterns
       mirror il_version_1.rnc definition for definition. -->

  <!-- Entry point: an instance is rooted at a single "document" element. -->
  <start>
    <ref name="Document"/>
  </start>

  <!-- document: one or more pages plus the required totalPages attribute. -->
  <define name="Document">
    <element name="document">
      <oneOrMore><ref name="Page"/></oneOrMore>
      <attribute name="totalPages"><data type="int"/></attribute>
    </element>
  </define>

  <!-- page: page boxes, repeatable content, pageNumber/Unit, baseOperations. -->
  <define name="Page">
    <element name="page">
      <element name="mediabox"><ref name="Box"/></element>
      <element name="cropbox"><ref name="Box"/></element>
      <zeroOrMore><ref name="PDFXobject"/></zeroOrMore>
      <zeroOrMore><ref name="PageLayout"/></zeroOrMore>
      <zeroOrMore><ref name="PDFRectangle"/></zeroOrMore>
      <zeroOrMore><ref name="PDFFont"/></zeroOrMore>
      <zeroOrMore><ref name="PDFParagraph"/></zeroOrMore>
      <zeroOrMore><ref name="PDFFigure"/></zeroOrMore>
      <zeroOrMore><ref name="PDFCharacter"/></zeroOrMore>
      <attribute name="pageNumber"><data type="int"/></attribute>
      <attribute name="Unit"><data type="string"/></attribute>
      <element name="baseOperations"><data type="string"/></element>
    </element>
  </define>

  <!-- box: four required float attributes. -->
  <define name="Box">
    <element name="box">
      <!-- from (x,y) to (x2,y2) -->
      <attribute name="x"><data type="float"/></attribute>
      <attribute name="y"><data type="float"/></attribute>
      <attribute name="x2"><data type="float"/></attribute>
      <attribute name="y2"><data type="float"/></attribute>
    </element>
  </define>

  <!-- Shared datatype for the xrefId attributes. -->
  <define name="PDFXrefId">
    <data type="int"/>
  </define>

  <!-- pdfFont: four required attributes, then optional flags and metrics. -->
  <define name="PDFFont">
    <element name="pdfFont">
      <attribute name="name"><data type="string"/></attribute>
      <attribute name="fontId"><data type="string"/></attribute>
      <attribute name="xrefId"><ref name="PDFXrefId"/></attribute>
      <attribute name="encodingLength"><data type="int"/></attribute>
      <optional>
        <attribute name="bold"><data type="boolean"/></attribute>
      </optional>
      <optional>
        <attribute name="italic"><data type="boolean"/></attribute>
      </optional>
      <optional>
        <attribute name="monospace"><data type="boolean"/></attribute>
      </optional>
      <optional>
        <attribute name="serif"><data type="boolean"/></attribute>
      </optional>
      <optional>
        <attribute name="ascent"><data type="float"/></attribute>
      </optional>
      <optional>
        <attribute name="descent"><data type="float"/></attribute>
      </optional>
    </element>
  </define>

  <!-- pdfXobject: ids, a box, any number of fonts, baseOperations. -->
  <define name="PDFXobject">
    <element name="pdfXobject">
      <attribute name="xobjId"><data type="int"/></attribute>
      <attribute name="xrefId"><ref name="PDFXrefId"/></attribute>
      <ref name="Box"/>
      <zeroOrMore><ref name="PDFFont"/></zeroOrMore>
      <element name="baseOperations"><data type="string"/></element>
    </element>
  </define>

  <!-- pdfCharacter: char_unicode is the only required attribute. -->
  <define name="PDFCharacter">
    <element name="pdfCharacter">
      <optional>
        <attribute name="vertical"><data type="boolean"/></attribute>
      </optional>
      <optional>
        <attribute name="scale"><data type="float"/></attribute>
      </optional>
      <optional>
        <attribute name="pdfCharacterId"><data type="int"/></attribute>
      </optional>
      <attribute name="char_unicode"><data type="string"/></attribute>
      <optional>
        <attribute name="advance"><data type="float"/></attribute>
      </optional>
      <optional>
        <!-- xobject nesting depth -->
        <attribute name="xobjId"><data type="int"/></attribute>
      </optional>
      <optional>
        <attribute name="debug_info"><data type="boolean"/></attribute>
      </optional>
      <ref name="PDFStyle"/>
      <ref name="Box"/>
    </element>
  </define>

  <!-- pageLayout: required id, conf and class_name plus a box. -->
  <define name="PageLayout">
    <element name="pageLayout">
      <attribute name="id"><data type="int"/></attribute>
      <attribute name="conf"><data type="float"/></attribute>
      <attribute name="class_name"><data type="string"/></attribute>
      <ref name="Box"/>
    </element>
  </define>

  <!-- graphicState: every attribute is optional; dash/ncolor/scolor are
       lists of one or more floats. -->
  <define name="GraphicState">
    <element name="graphicState">
      <optional>
        <attribute name="linewidth"><data type="float"/></attribute>
      </optional>
      <optional>
        <attribute name="dash">
          <list>
            <oneOrMore><data type="float"/></oneOrMore>
          </list>
        </attribute>
      </optional>
      <optional>
        <attribute name="flatness"><data type="float"/></attribute>
      </optional>
      <optional>
        <attribute name="intent"><data type="string"/></attribute>
      </optional>
      <optional>
        <attribute name="linecap"><data type="int"/></attribute>
      </optional>
      <optional>
        <attribute name="linejoin"><data type="int"/></attribute>
      </optional>
      <optional>
        <attribute name="miterlimit"><data type="float"/></attribute>
      </optional>
      <optional>
        <attribute name="ncolor">
          <list>
            <oneOrMore><data type="float"/></oneOrMore>
          </list>
        </attribute>
      </optional>
      <optional>
        <attribute name="scolor">
          <list>
            <oneOrMore><data type="float"/></oneOrMore>
          </list>
        </attribute>
      </optional>
      <optional>
        <attribute name="strokingColorSpaceName"><data type="string"/></attribute>
      </optional>
      <optional>
        <attribute name="nonStrokingColorSpaceName"><data type="string"/></attribute>
      </optional>
      <optional>
        <attribute name="passthroughPerCharInstruction"><data type="string"/></attribute>
      </optional>
    </element>
  </define>

  <!-- pdfStyle: required font_id and font_size plus a graphic state. -->
  <define name="PDFStyle">
    <element name="pdfStyle">
      <attribute name="font_id"><data type="string"/></attribute>
      <attribute name="font_size"><data type="float"/></attribute>
      <ref name="GraphicState"/>
    </element>
  </define>

  <!-- pdfParagraph: unicode is the only required attribute. -->
  <define name="PDFParagraph">
    <element name="pdfParagraph">
      <optional>
        <attribute name="xobjId"><data type="int"/></attribute>
      </optional>
      <attribute name="unicode"><data type="string"/></attribute>
      <optional>
        <attribute name="scale"><data type="float"/></attribute>
      </optional>
      <optional>
        <attribute name="vertical"><data type="boolean"/></attribute>
      </optional>
      <optional>
        <attribute name="FirstLineIndent"><data type="boolean"/></attribute>
      </optional>
      <optional>
        <attribute name="debug_id"><data type="string"/></attribute>
      </optional>
      <ref name="Box"/>
      <ref name="PDFStyle"/>
      <zeroOrMore><ref name="PDFParagraphComposition"/></zeroOrMore>
    </element>
  </define>

  <!-- pdfParagraphComposition: exactly one of five alternatives. -->
  <define name="PDFParagraphComposition">
    <element name="pdfParagraphComposition">
      <choice>
        <ref name="PDFLine"/>
        <ref name="PDFFormula"/>
        <ref name="PDFSameStyleCharacters"/>
        <ref name="PDFCharacter"/>
        <ref name="PDFSameStyleUnicodeCharacters"/>
      </choice>
    </element>
  </define>

  <!-- pdfLine: a box followed by at least one character. -->
  <define name="PDFLine">
    <element name="pdfLine">
      <ref name="Box"/>
      <oneOrMore><ref name="PDFCharacter"/></oneOrMore>
    </element>
  </define>

  <!-- pdfSameStyleCharacters: a box, one style, at least one character. -->
  <define name="PDFSameStyleCharacters">
    <element name="pdfSameStyleCharacters">
      <ref name="Box"/>
      <ref name="PDFStyle"/>
      <oneOrMore><ref name="PDFCharacter"/></oneOrMore>
    </element>
  </define>

  <!-- pdfSameStyleUnicodeCharacters: optional style, required unicode. -->
  <define name="PDFSameStyleUnicodeCharacters">
    <element name="pdfSameStyleUnicodeCharacters">
      <optional>
        <ref name="PDFStyle"/>
      </optional>
      <attribute name="unicode"><data type="string"/></attribute>
      <optional>
        <attribute name="debug_info"><data type="boolean"/></attribute>
      </optional>
    </element>
  </define>

  <!-- pdfFormula: a box, at least one character, required x/y offsets. -->
  <define name="PDFFormula">
    <element name="pdfFormula">
      <ref name="Box"/>
      <oneOrMore><ref name="PDFCharacter"/></oneOrMore>
      <attribute name="x_offset"><data type="float"/></attribute>
      <attribute name="y_offset"><data type="float"/></attribute>
    </element>
  </define>

  <!-- pdfFigure: nothing but a box. -->
  <define name="PDFFigure">
    <element name="pdfFigure">
      <ref name="Box"/>
    </element>
  </define>

  <!-- pdfRectangle: a box, a graphic state, optional debug_info. -->
  <define name="PDFRectangle">
    <element name="pdfRectangle">
      <ref name="Box"/>
      <ref name="GraphicState"/>
      <optional>
        <attribute name="debug_info"><data type="boolean"/></attribute>
      </optional>
    </element>
  </define>
</grammar>
|
src/pdf2u/document_il/il_version_1.xsd
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified">
|
| 3 |
+
<xs:element name="document">
|
| 4 |
+
<xs:complexType>
|
| 5 |
+
<xs:sequence>
|
| 6 |
+
<xs:element maxOccurs="unbounded" ref="page"/>
|
| 7 |
+
</xs:sequence>
|
| 8 |
+
<xs:attribute name="totalPages" use="required" type="xs:int"/>
|
| 9 |
+
</xs:complexType>
|
| 10 |
+
</xs:element>
|
| 11 |
+
<xs:element name="page">
|
| 12 |
+
<xs:complexType>
|
| 13 |
+
<xs:sequence>
|
| 14 |
+
<xs:element ref="mediabox"/>
|
| 15 |
+
<xs:element ref="cropbox"/>
|
| 16 |
+
<xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfXobject"/>
|
| 17 |
+
<xs:element minOccurs="0" maxOccurs="unbounded" ref="pageLayout"/>
|
| 18 |
+
<xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfRectangle"/>
|
| 19 |
+
<xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFont"/>
|
| 20 |
+
<xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfParagraph"/>
|
| 21 |
+
<xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFigure"/>
|
| 22 |
+
<xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfCharacter"/>
|
| 23 |
+
<xs:element ref="baseOperations"/>
|
| 24 |
+
</xs:sequence>
|
| 25 |
+
<xs:attribute name="pageNumber" use="required" type="xs:int"/>
|
| 26 |
+
<xs:attribute name="Unit" use="required" type="xs:string"/>
|
| 27 |
+
</xs:complexType>
|
| 28 |
+
</xs:element>
|
| 29 |
+
<xs:element name="mediabox">
|
| 30 |
+
<xs:complexType>
|
| 31 |
+
<xs:sequence>
|
| 32 |
+
<xs:element ref="box"/>
|
| 33 |
+
</xs:sequence>
|
| 34 |
+
</xs:complexType>
|
| 35 |
+
</xs:element>
|
| 36 |
+
<xs:element name="cropbox">
|
| 37 |
+
<xs:complexType>
|
| 38 |
+
<xs:sequence>
|
| 39 |
+
<xs:element ref="box"/>
|
| 40 |
+
</xs:sequence>
|
| 41 |
+
</xs:complexType>
|
| 42 |
+
</xs:element>
|
| 43 |
+
<xs:element name="baseOperations" type="xs:string"/>
|
| 44 |
+
<xs:element name="box">
|
| 45 |
+
<xs:complexType>
|
| 46 |
+
<xs:attribute name="x" use="required" type="xs:float"/>
|
| 47 |
+
<xs:attribute name="y" use="required" type="xs:float"/>
|
| 48 |
+
<xs:attribute name="x2" use="required" type="xs:float"/>
|
| 49 |
+
<xs:attribute name="y2" use="required" type="xs:float"/>
|
| 50 |
+
</xs:complexType>
|
| 51 |
+
</xs:element>
|
| 52 |
+
<xs:simpleType name="PDFXrefId">
|
| 53 |
+
<xs:restriction base="xs:int"/>
|
| 54 |
+
</xs:simpleType>
|
| 55 |
+
<xs:element name="pdfFont">
|
| 56 |
+
<xs:complexType>
|
| 57 |
+
<xs:attribute name="name" use="required" type="xs:string"/>
|
| 58 |
+
<xs:attribute name="fontId" use="required" type="xs:string"/>
|
| 59 |
+
<xs:attribute name="xrefId" use="required" type="PDFXrefId"/>
|
| 60 |
+
<xs:attribute name="encodingLength" use="required" type="xs:int"/>
|
| 61 |
+
<xs:attribute name="bold" type="xs:boolean"/>
|
| 62 |
+
<xs:attribute name="italic" type="xs:boolean"/>
|
| 63 |
+
<xs:attribute name="monospace" type="xs:boolean"/>
|
| 64 |
+
<xs:attribute name="serif" type="xs:boolean"/>
|
| 65 |
+
<xs:attribute name="ascent" type="xs:float"/>
|
| 66 |
+
<xs:attribute name="descent" type="xs:float"/>
|
| 67 |
+
</xs:complexType>
|
| 68 |
+
</xs:element>
|
| 69 |
+
<xs:element name="pdfXobject">
|
| 70 |
+
<xs:complexType>
|
| 71 |
+
<xs:sequence>
|
| 72 |
+
<xs:element ref="box"/>
|
| 73 |
+
<xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFont"/>
|
| 74 |
+
<xs:element ref="baseOperations"/>
|
| 75 |
+
</xs:sequence>
|
| 76 |
+
<xs:attribute name="xobjId" use="required" type="xs:int"/>
|
| 77 |
+
<xs:attribute name="xrefId" use="required" type="PDFXrefId"/>
|
| 78 |
+
</xs:complexType>
|
| 79 |
+
</xs:element>
|
| 80 |
+
<xs:element name="pdfCharacter">
|
| 81 |
+
<xs:complexType>
|
| 82 |
+
<xs:sequence>
|
| 83 |
+
<xs:element ref="pdfStyle"/>
|
| 84 |
+
<xs:element ref="box"/>
|
| 85 |
+
</xs:sequence>
|
| 86 |
+
<xs:attribute name="vertical" type="xs:boolean"/>
|
| 87 |
+
<xs:attribute name="scale" type="xs:float"/>
|
| 88 |
+
<xs:attribute name="pdfCharacterId" type="xs:int"/>
|
| 89 |
+
<xs:attribute name="char_unicode" use="required" type="xs:string"/>
|
| 90 |
+
<xs:attribute name="advance" type="xs:float"/>
|
| 91 |
+
<xs:attribute name="xobjId" type="xs:int"/>
|
| 92 |
+
<xs:attribute name="debug_info" type="xs:boolean"/>
|
| 93 |
+
</xs:complexType>
|
| 94 |
+
</xs:element>
|
| 95 |
+
<xs:element name="pageLayout">
|
| 96 |
+
<xs:complexType>
|
| 97 |
+
<xs:sequence>
|
| 98 |
+
<xs:element ref="box"/>
|
| 99 |
+
</xs:sequence>
|
| 100 |
+
<xs:attribute name="id" use="required" type="xs:int"/>
|
| 101 |
+
<xs:attribute name="conf" use="required" type="xs:float"/>
|
| 102 |
+
<xs:attribute name="class_name" use="required" type="xs:string"/>
|
| 103 |
+
</xs:complexType>
|
| 104 |
+
</xs:element>
|
| 105 |
+
<xs:element name="graphicState">
|
| 106 |
+
<xs:complexType>
|
| 107 |
+
<xs:attribute name="linewidth" type="xs:float"/>
|
| 108 |
+
<xs:attribute name="dash">
|
| 109 |
+
<xs:simpleType>
|
| 110 |
+
<xs:restriction>
|
| 111 |
+
<xs:simpleType>
|
| 112 |
+
<xs:list itemType="xs:float"/>
|
| 113 |
+
</xs:simpleType>
|
| 114 |
+
<xs:minLength value="1"/>
|
| 115 |
+
</xs:restriction>
|
| 116 |
+
</xs:simpleType>
|
| 117 |
+
</xs:attribute>
|
| 118 |
+
<xs:attribute name="flatness" type="xs:float"/>
|
| 119 |
+
<xs:attribute name="intent" type="xs:string"/>
|
| 120 |
+
<xs:attribute name="linecap" type="xs:int"/>
|
| 121 |
+
<xs:attribute name="linejoin" type="xs:int"/>
|
| 122 |
+
<xs:attribute name="miterlimit" type="xs:float"/>
|
| 123 |
+
<xs:attribute name="ncolor">
|
| 124 |
+
<xs:simpleType>
|
| 125 |
+
<xs:restriction>
|
| 126 |
+
<xs:simpleType>
|
| 127 |
+
<xs:list itemType="xs:float"/>
|
| 128 |
+
</xs:simpleType>
|
| 129 |
+
<xs:minLength value="1"/>
|
| 130 |
+
</xs:restriction>
|
| 131 |
+
</xs:simpleType>
|
| 132 |
+
</xs:attribute>
|
| 133 |
+
<xs:attribute name="scolor">
|
| 134 |
+
<xs:simpleType>
|
| 135 |
+
<xs:restriction>
|
| 136 |
+
<xs:simpleType>
|
| 137 |
+
<xs:list itemType="xs:float"/>
|
| 138 |
+
</xs:simpleType>
|
| 139 |
+
<xs:minLength value="1"/>
|
| 140 |
+
</xs:restriction>
|
| 141 |
+
</xs:simpleType>
|
| 142 |
+
</xs:attribute>
|
| 143 |
+
<xs:attribute name="strokingColorSpaceName" type="xs:string"/>
|
| 144 |
+
<xs:attribute name="nonStrokingColorSpaceName" type="xs:string"/>
|
| 145 |
+
<xs:attribute name="passthroughPerCharInstruction" type="xs:string"/>
|
| 146 |
+
</xs:complexType>
|
| 147 |
+
</xs:element>
|
| 148 |
+
<xs:element name="pdfStyle">
|
| 149 |
+
<xs:complexType>
|
| 150 |
+
<xs:sequence>
|
| 151 |
+
<xs:element ref="graphicState"/>
|
| 152 |
+
</xs:sequence>
|
| 153 |
+
<xs:attribute name="font_id" use="required" type="xs:string"/>
|
| 154 |
+
<xs:attribute name="font_size" use="required" type="xs:float"/>
|
| 155 |
+
</xs:complexType>
|
| 156 |
+
</xs:element>
|
| 157 |
+
<xs:element name="pdfParagraph">
|
| 158 |
+
<xs:complexType>
|
| 159 |
+
<xs:sequence>
|
| 160 |
+
<xs:element ref="box"/>
|
| 161 |
+
<xs:element ref="pdfStyle"/>
|
| 162 |
+
<xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfParagraphComposition"/>
|
| 163 |
+
</xs:sequence>
|
| 164 |
+
<xs:attribute name="xobjId" type="xs:int"/>
|
| 165 |
+
<xs:attribute name="unicode" use="required" type="xs:string"/>
|
| 166 |
+
<xs:attribute name="scale" type="xs:float"/>
|
| 167 |
+
<xs:attribute name="vertical" type="xs:boolean"/>
|
| 168 |
+
<xs:attribute name="FirstLineIndent" type="xs:boolean"/>
|
| 169 |
+
<xs:attribute name="debug_id" type="xs:string"/>
|
| 170 |
+
</xs:complexType>
|
| 171 |
+
</xs:element>
|
| 172 |
+
<xs:element name="pdfParagraphComposition">
|
| 173 |
+
<xs:complexType>
|
| 174 |
+
<xs:choice>
|
| 175 |
+
<xs:element ref="pdfLine"/>
|
| 176 |
+
<xs:element ref="pdfFormula"/>
|
| 177 |
+
<xs:element ref="pdfSameStyleCharacters"/>
|
| 178 |
+
<xs:element ref="pdfCharacter"/>
|
| 179 |
+
<xs:element ref="pdfSameStyleUnicodeCharacters"/>
|
| 180 |
+
</xs:choice>
|
| 181 |
+
</xs:complexType>
|
| 182 |
+
</xs:element>
|
| 183 |
+
<xs:element name="pdfLine">
|
| 184 |
+
<xs:complexType>
|
| 185 |
+
<xs:sequence>
|
| 186 |
+
<xs:element ref="box"/>
|
| 187 |
+
<xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
|
| 188 |
+
</xs:sequence>
|
| 189 |
+
</xs:complexType>
|
| 190 |
+
</xs:element>
|
| 191 |
+
<xs:element name="pdfSameStyleCharacters">
|
| 192 |
+
<xs:complexType>
|
| 193 |
+
<xs:sequence>
|
| 194 |
+
<xs:element ref="box"/>
|
| 195 |
+
<xs:element ref="pdfStyle"/>
|
| 196 |
+
<xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
|
| 197 |
+
</xs:sequence>
|
| 198 |
+
</xs:complexType>
|
| 199 |
+
</xs:element>
|
| 200 |
+
<xs:element name="pdfSameStyleUnicodeCharacters">
|
| 201 |
+
<xs:complexType>
|
| 202 |
+
<xs:sequence>
|
| 203 |
+
<xs:element minOccurs="0" ref="pdfStyle"/>
|
| 204 |
+
</xs:sequence>
|
| 205 |
+
<xs:attribute name="unicode" use="required" type="xs:string"/>
|
| 206 |
+
<xs:attribute name="debug_info" type="xs:boolean"/>
|
| 207 |
+
</xs:complexType>
|
| 208 |
+
</xs:element>
|
| 209 |
+
<xs:element name="pdfFormula">
|
| 210 |
+
<xs:complexType>
|
| 211 |
+
<xs:sequence>
|
| 212 |
+
<xs:element ref="box"/>
|
| 213 |
+
<xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
|
| 214 |
+
</xs:sequence>
|
| 215 |
+
<xs:attribute name="x_offset" use="required" type="xs:float"/>
|
| 216 |
+
<xs:attribute name="y_offset" use="required" type="xs:float"/>
|
| 217 |
+
</xs:complexType>
|
| 218 |
+
</xs:element>
|
| 219 |
+
<xs:element name="pdfFigure">
|
| 220 |
+
<xs:complexType>
|
| 221 |
+
<xs:sequence>
|
| 222 |
+
<xs:element ref="box"/>
|
| 223 |
+
</xs:sequence>
|
| 224 |
+
</xs:complexType>
|
| 225 |
+
</xs:element>
|
| 226 |
+
<xs:element name="pdfRectangle">
|
| 227 |
+
<xs:complexType>
|
| 228 |
+
<xs:sequence>
|
| 229 |
+
<xs:element ref="box"/>
|
| 230 |
+
<xs:element ref="graphicState"/>
|
| 231 |
+
</xs:sequence>
|
| 232 |
+
<xs:attribute name="debug_info" type="xs:boolean"/>
|
| 233 |
+
</xs:complexType>
|
| 234 |
+
</xs:element>
|
| 235 |
+
</xs:schema>
|
src/pdf2u/document_il/midend/__init__.py
ADDED
|
File without changes
|
src/pdf2u/document_il/midend/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (180 Bytes). View file
|
|
|