Merge pull request #877 from datalab-to/dev
Browse files- marker/converters/pdf.py +3 -1
- marker/models.py +13 -5
- marker/schema/blocks/base.py +11 -0
- poetry.lock +82 -55
- pyproject.toml +2 -2
marker/converters/pdf.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
import os
|
| 2 |
|
|
|
|
|
|
|
| 3 |
os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
|
| 4 |
|
| 5 |
from collections import defaultdict
|
|
@@ -171,7 +173,7 @@ class PdfConverter(BaseConverter):
|
|
| 171 |
if temp_file is not None and os.path.exists(temp_file.name):
|
| 172 |
os.unlink(temp_file.name)
|
| 173 |
|
| 174 |
-
def build_document(self, filepath: str):
|
| 175 |
provider_cls = provider_from_filepath(filepath)
|
| 176 |
layout_builder = self.resolve_dependencies(self.layout_builder_class)
|
| 177 |
line_builder = self.resolve_dependencies(LineBuilder)
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
+
from marker.schema.document import Document
|
| 4 |
+
|
| 5 |
os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
|
| 6 |
|
| 7 |
from collections import defaultdict
|
|
|
|
| 173 |
if temp_file is not None and os.path.exists(temp_file.name):
|
| 174 |
os.unlink(temp_file.name)
|
| 175 |
|
| 176 |
+
def build_document(self, filepath: str) -> Document:
|
| 177 |
provider_cls = provider_from_filepath(filepath)
|
| 178 |
layout_builder = self.resolve_dependencies(self.layout_builder_class)
|
| 179 |
line_builder = self.resolve_dependencies(LineBuilder)
|
marker/models.py
CHANGED
|
@@ -1,5 +1,8 @@
|
|
| 1 |
import os
|
| 2 |
-
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
from surya.foundation import FoundationPredictor
|
| 5 |
from surya.detection import DetectionPredictor
|
|
@@ -8,13 +11,18 @@ from surya.ocr_error import OCRErrorPredictor
|
|
| 8 |
from surya.recognition import RecognitionPredictor
|
| 9 |
from surya.table_rec import TableRecPredictor
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
return {
|
| 14 |
"foundation_model": foundation_predictor,
|
| 15 |
"layout_model": LayoutPredictor(device=device, dtype=dtype),
|
| 16 |
"recognition_model": RecognitionPredictor(foundation_predictor),
|
| 17 |
"table_rec_model": TableRecPredictor(device=device, dtype=dtype),
|
| 18 |
"detection_model": DetectionPredictor(device=device, dtype=dtype),
|
| 19 |
-
"ocr_error_model": OCRErrorPredictor(device=device, dtype=dtype)
|
| 20 |
-
}
|
|
|
|
| 1 |
import os
|
| 2 |
+
|
| 3 |
+
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = (
|
| 4 |
+
"1" # Transformers uses .isin for an op, which is not supported on MPS
|
| 5 |
+
)
|
| 6 |
|
| 7 |
from surya.foundation import FoundationPredictor
|
| 8 |
from surya.detection import DetectionPredictor
|
|
|
|
| 11 |
from surya.recognition import RecognitionPredictor
|
| 12 |
from surya.table_rec import TableRecPredictor
|
| 13 |
|
| 14 |
+
|
| 15 |
+
def create_model_dict(
|
| 16 |
+
device=None, dtype=None, attention_implementation: str | None = None
|
| 17 |
+
) -> dict:
|
| 18 |
+
foundation_predictor = FoundationPredictor(
|
| 19 |
+
device=device, dtype=dtype, attention_implementation=attention_implementation
|
| 20 |
+
)
|
| 21 |
return {
|
| 22 |
"foundation_model": foundation_predictor,
|
| 23 |
"layout_model": LayoutPredictor(device=device, dtype=dtype),
|
| 24 |
"recognition_model": RecognitionPredictor(foundation_predictor),
|
| 25 |
"table_rec_model": TableRecPredictor(device=device, dtype=dtype),
|
| 26 |
"detection_model": DetectionPredictor(device=device, dtype=dtype),
|
| 27 |
+
"ocr_error_model": OCRErrorPredictor(device=device, dtype=dtype),
|
| 28 |
+
}
|
marker/schema/blocks/base.py
CHANGED
|
@@ -100,6 +100,7 @@ class Block(BaseModel):
|
|
| 100 |
lowres_image: Image.Image | None = None
|
| 101 |
highres_image: Image.Image | None = None
|
| 102 |
removed: bool = False # Has block been replaced by new block?
|
|
|
|
| 103 |
|
| 104 |
model_config = ConfigDict(arbitrary_types_allowed=True)
|
| 105 |
|
|
@@ -114,6 +115,16 @@ class Block(BaseModel):
|
|
| 114 |
block_attrs = block.model_dump(exclude=["id", "block_id", "block_type"])
|
| 115 |
return cls(**block_attrs)
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
def get_image(
|
| 118 |
self,
|
| 119 |
document: Document,
|
|
|
|
| 100 |
lowres_image: Image.Image | None = None
|
| 101 |
highres_image: Image.Image | None = None
|
| 102 |
removed: bool = False # Has block been replaced by new block?
|
| 103 |
+
_metadata: Optional[dict] = None
|
| 104 |
|
| 105 |
model_config = ConfigDict(arbitrary_types_allowed=True)
|
| 106 |
|
|
|
|
| 115 |
block_attrs = block.model_dump(exclude=["id", "block_id", "block_type"])
|
| 116 |
return cls(**block_attrs)
|
| 117 |
|
| 118 |
+
def set_internal_metadata(self, key, data):
|
| 119 |
+
if self._metadata is None:
|
| 120 |
+
self._metadata = {}
|
| 121 |
+
self._metadata[key] = data
|
| 122 |
+
|
| 123 |
+
def get_internal_metadata(self, key):
|
| 124 |
+
if self._metadata is None:
|
| 125 |
+
return None
|
| 126 |
+
return self._metadata.get(key)
|
| 127 |
+
|
| 128 |
def get_image(
|
| 129 |
self,
|
| 130 |
document: Document,
|
poetry.lock
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
# This file is automatically @generated by Poetry 2.1.
|
| 2 |
|
| 3 |
[[package]]
|
| 4 |
name = "aiohappyeyeballs"
|
|
@@ -1652,6 +1652,28 @@ files = [
|
|
| 1652 |
{file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
|
| 1653 |
]
|
| 1654 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1655 |
[[package]]
|
| 1656 |
name = "httpcore"
|
| 1657 |
version = "1.0.7"
|
|
@@ -1701,19 +1723,20 @@ zstd = ["zstandard (>=0.18.0)"]
|
|
| 1701 |
|
| 1702 |
[[package]]
|
| 1703 |
name = "huggingface-hub"
|
| 1704 |
-
version = "0.
|
| 1705 |
description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
|
| 1706 |
optional = false
|
| 1707 |
python-versions = ">=3.8.0"
|
| 1708 |
groups = ["main", "dev"]
|
| 1709 |
files = [
|
| 1710 |
-
{file = "huggingface_hub-0.
|
| 1711 |
-
{file = "huggingface_hub-0.
|
| 1712 |
]
|
| 1713 |
|
| 1714 |
[package.dependencies]
|
| 1715 |
filelock = "*"
|
| 1716 |
fsspec = ">=2023.5.0"
|
|
|
|
| 1717 |
packaging = ">=20.9"
|
| 1718 |
pyyaml = ">=5.1"
|
| 1719 |
requests = "*"
|
|
@@ -1721,17 +1744,19 @@ tqdm = ">=4.42.1"
|
|
| 1721 |
typing-extensions = ">=3.7.4.3"
|
| 1722 |
|
| 1723 |
[package.extras]
|
| 1724 |
-
all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (
|
| 1725 |
cli = ["InquirerPy (==0.3.4)"]
|
| 1726 |
-
dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (
|
| 1727 |
fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"]
|
| 1728 |
hf-transfer = ["hf-transfer (>=0.1.4)"]
|
| 1729 |
-
hf-xet = ["hf-xet (>=
|
| 1730 |
inference = ["aiohttp"]
|
| 1731 |
-
|
|
|
|
|
|
|
| 1732 |
tensorflow = ["graphviz", "pydot", "tensorflow"]
|
| 1733 |
tensorflow-testing = ["keras (<3.0)", "tensorflow"]
|
| 1734 |
-
testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"]
|
| 1735 |
torch = ["safetensors[torch]", "torch"]
|
| 1736 |
typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"]
|
| 1737 |
|
|
@@ -5390,21 +5415,21 @@ snowflake = ["snowflake-connector-python (>=3.3.0) ; python_version < \"3.12\"",
|
|
| 5390 |
|
| 5391 |
[[package]]
|
| 5392 |
name = "surya-ocr"
|
| 5393 |
-
version = "0.16.
|
| 5394 |
description = "OCR, layout, reading order, and table recognition in 90+ languages"
|
| 5395 |
optional = false
|
| 5396 |
python-versions = "<4.0,>=3.10"
|
| 5397 |
groups = ["main"]
|
| 5398 |
files = [
|
| 5399 |
-
{file = "surya_ocr-0.16.
|
| 5400 |
-
{file = "surya_ocr-0.16.
|
| 5401 |
]
|
| 5402 |
|
| 5403 |
[package.dependencies]
|
| 5404 |
click = ">=8.1.8,<9.0.0"
|
| 5405 |
einops = ">=0.8.1,<0.9.0"
|
| 5406 |
filetype = ">=1.2.0,<2.0.0"
|
| 5407 |
-
opencv-python-headless = "
|
| 5408 |
pillow = ">=10.2.0,<11.0.0"
|
| 5409 |
platformdirs = ">=4.3.6,<5.0.0"
|
| 5410 |
pre-commit = ">=4.2.0,<5.0.0"
|
|
@@ -5413,7 +5438,7 @@ pydantic-settings = ">=2.1.0,<3.0.0"
|
|
| 5413 |
pypdfium2 = "4.30.0"
|
| 5414 |
python-dotenv = ">=1.0.0,<2.0.0"
|
| 5415 |
torch = ">=2.7.0,<3.0.0"
|
| 5416 |
-
transformers = ">=4.
|
| 5417 |
|
| 5418 |
[[package]]
|
| 5419 |
name = "sympy"
|
|
@@ -5540,27 +5565,27 @@ test = ["pytest", "ruff"]
|
|
| 5540 |
|
| 5541 |
[[package]]
|
| 5542 |
name = "tokenizers"
|
| 5543 |
-
version = "0.
|
| 5544 |
description = ""
|
| 5545 |
optional = false
|
| 5546 |
python-versions = ">=3.9"
|
| 5547 |
groups = ["main"]
|
| 5548 |
files = [
|
| 5549 |
-
{file = "tokenizers-0.
|
| 5550 |
-
{file = "tokenizers-0.
|
| 5551 |
-
{file = "tokenizers-0.
|
| 5552 |
-
{file = "tokenizers-0.
|
| 5553 |
-
{file = "tokenizers-0.
|
| 5554 |
-
{file = "tokenizers-0.
|
| 5555 |
-
{file = "tokenizers-0.
|
| 5556 |
-
{file = "tokenizers-0.
|
| 5557 |
-
{file = "tokenizers-0.
|
| 5558 |
-
{file = "tokenizers-0.
|
| 5559 |
-
{file = "tokenizers-0.
|
| 5560 |
-
{file = "tokenizers-0.
|
| 5561 |
-
{file = "tokenizers-0.
|
| 5562 |
-
{file = "tokenizers-0.
|
| 5563 |
-
{file = "tokenizers-0.
|
| 5564 |
]
|
| 5565 |
|
| 5566 |
[package.dependencies]
|
|
@@ -5569,7 +5594,7 @@ huggingface-hub = ">=0.16.4,<1.0"
|
|
| 5569 |
[package.extras]
|
| 5570 |
dev = ["tokenizers[testing]"]
|
| 5571 |
docs = ["setuptools-rust", "sphinx", "sphinx-rtd-theme"]
|
| 5572 |
-
testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests", "ruff"]
|
| 5573 |
|
| 5574 |
[[package]]
|
| 5575 |
name = "toml"
|
|
@@ -5749,74 +5774,76 @@ test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,
|
|
| 5749 |
|
| 5750 |
[[package]]
|
| 5751 |
name = "transformers"
|
| 5752 |
-
version = "4.
|
| 5753 |
description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
|
| 5754 |
optional = false
|
| 5755 |
python-versions = ">=3.9.0"
|
| 5756 |
groups = ["main"]
|
| 5757 |
files = [
|
| 5758 |
-
{file = "transformers-4.
|
| 5759 |
-
{file = "transformers-4.
|
| 5760 |
]
|
| 5761 |
|
| 5762 |
[package.dependencies]
|
| 5763 |
filelock = "*"
|
| 5764 |
-
huggingface-hub = ">=0.
|
| 5765 |
numpy = ">=1.17"
|
| 5766 |
packaging = ">=20.0"
|
| 5767 |
pyyaml = ">=5.1"
|
| 5768 |
regex = "!=2019.12.17"
|
| 5769 |
requests = "*"
|
| 5770 |
safetensors = ">=0.4.3"
|
| 5771 |
-
tokenizers = ">=0.
|
| 5772 |
tqdm = ">=4.27"
|
| 5773 |
|
| 5774 |
[package.extras]
|
| 5775 |
accelerate = ["accelerate (>=0.26.0)"]
|
| 5776 |
-
|
| 5777 |
-
all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.3.2,<0.4)", "librosa", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision"]
|
| 5778 |
audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
|
| 5779 |
benchmark = ["optimum-benchmark (>=0.3.0)"]
|
|
|
|
| 5780 |
codecarbon = ["codecarbon (>=2.8.1)"]
|
| 5781 |
deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"]
|
| 5782 |
-
deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (
|
| 5783 |
-
dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (
|
| 5784 |
-
dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (
|
| 5785 |
-
dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (
|
| 5786 |
flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"]
|
| 5787 |
flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
|
| 5788 |
ftfy = ["ftfy"]
|
| 5789 |
-
hf-xet = ["
|
| 5790 |
-
hub-kernels = ["kernels (>=0.
|
| 5791 |
-
integrations = ["kernels (>=0.
|
| 5792 |
-
ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "
|
|
|
|
| 5793 |
modelcreation = ["cookiecutter (==1.7.3)"]
|
| 5794 |
natten = ["natten (>=0.14.6,<0.15.0)"]
|
| 5795 |
num2words = ["num2words"]
|
| 5796 |
onnx = ["onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "tf2onnx"]
|
| 5797 |
onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"]
|
|
|
|
| 5798 |
optuna = ["optuna"]
|
| 5799 |
-
quality = ["GitPython (<3.1.19)", "datasets (
|
| 5800 |
ray = ["ray[tune] (>=2.7.0)"]
|
| 5801 |
-
retrieval = ["datasets (
|
| 5802 |
ruff = ["ruff (==0.11.2)"]
|
| 5803 |
sagemaker = ["sagemaker (>=2.31.0)"]
|
| 5804 |
sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"]
|
| 5805 |
-
serving = ["fastapi", "pydantic", "starlette", "uvicorn"]
|
| 5806 |
sigopt = ["sigopt"]
|
| 5807 |
sklearn = ["scikit-learn"]
|
| 5808 |
speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
|
| 5809 |
-
testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (
|
| 5810 |
tf = ["keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"]
|
| 5811 |
tf-cpu = ["keras (>2.9,<2.16)", "keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow-cpu (>2.9,<2.16)", "tensorflow-probability (<0.24)", "tensorflow-text (<2.16)", "tf2onnx"]
|
| 5812 |
tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
|
| 5813 |
tiktoken = ["blobfile", "tiktoken"]
|
| 5814 |
-
timm = ["timm (
|
| 5815 |
-
tokenizers = ["tokenizers (>=0.
|
| 5816 |
-
torch = ["accelerate (>=0.26.0)", "torch (>=2.
|
| 5817 |
torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
|
| 5818 |
torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"]
|
| 5819 |
-
torchhub = ["filelock", "huggingface-hub (>=0.
|
| 5820 |
video = ["av"]
|
| 5821 |
vision = ["Pillow (>=10.0.1,<=15.0)"]
|
| 5822 |
|
|
@@ -6505,4 +6532,4 @@ full = ["ebooklib", "mammoth", "openpyxl", "python-pptx", "weasyprint"]
|
|
| 6505 |
[metadata]
|
| 6506 |
lock-version = "2.1"
|
| 6507 |
python-versions = "^3.10"
|
| 6508 |
-
content-hash = "
|
|
|
|
| 1 |
+
# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.
|
| 2 |
|
| 3 |
[[package]]
|
| 4 |
name = "aiohappyeyeballs"
|
|
|
|
| 1652 |
{file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
|
| 1653 |
]
|
| 1654 |
|
| 1655 |
+
[[package]]
|
| 1656 |
+
name = "hf-xet"
|
| 1657 |
+
version = "1.1.9"
|
| 1658 |
+
description = "Fast transfer of large files with the Hugging Face Hub."
|
| 1659 |
+
optional = false
|
| 1660 |
+
python-versions = ">=3.8"
|
| 1661 |
+
groups = ["main", "dev"]
|
| 1662 |
+
markers = "platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"arm64\" or platform_machine == \"aarch64\""
|
| 1663 |
+
files = [
|
| 1664 |
+
{file = "hf_xet-1.1.9-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:a3b6215f88638dd7a6ff82cb4e738dcbf3d863bf667997c093a3c990337d1160"},
|
| 1665 |
+
{file = "hf_xet-1.1.9-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:9b486de7a64a66f9a172f4b3e0dfe79c9f0a93257c501296a2521a13495a698a"},
|
| 1666 |
+
{file = "hf_xet-1.1.9-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4c5a840c2c4e6ec875ed13703a60e3523bc7f48031dfd750923b2a4d1a5fc3c"},
|
| 1667 |
+
{file = "hf_xet-1.1.9-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:96a6139c9e44dad1c52c52520db0fffe948f6bce487cfb9d69c125f254bb3790"},
|
| 1668 |
+
{file = "hf_xet-1.1.9-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ad1022e9a998e784c97b2173965d07fe33ee26e4594770b7785a8cc8f922cd95"},
|
| 1669 |
+
{file = "hf_xet-1.1.9-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:86754c2d6d5afb11b0a435e6e18911a4199262fe77553f8c50d75e21242193ea"},
|
| 1670 |
+
{file = "hf_xet-1.1.9-cp37-abi3-win_amd64.whl", hash = "sha256:5aad3933de6b725d61d51034e04174ed1dce7a57c63d530df0014dea15a40127"},
|
| 1671 |
+
{file = "hf_xet-1.1.9.tar.gz", hash = "sha256:c99073ce404462e909f1d5839b2d14a3827b8fe75ed8aed551ba6609c026c803"},
|
| 1672 |
+
]
|
| 1673 |
+
|
| 1674 |
+
[package.extras]
|
| 1675 |
+
tests = ["pytest"]
|
| 1676 |
+
|
| 1677 |
[[package]]
|
| 1678 |
name = "httpcore"
|
| 1679 |
version = "1.0.7"
|
|
|
|
| 1723 |
|
| 1724 |
[[package]]
|
| 1725 |
name = "huggingface-hub"
|
| 1726 |
+
version = "0.34.4"
|
| 1727 |
description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
|
| 1728 |
optional = false
|
| 1729 |
python-versions = ">=3.8.0"
|
| 1730 |
groups = ["main", "dev"]
|
| 1731 |
files = [
|
| 1732 |
+
{file = "huggingface_hub-0.34.4-py3-none-any.whl", hash = "sha256:9b365d781739c93ff90c359844221beef048403f1bc1f1c123c191257c3c890a"},
|
| 1733 |
+
{file = "huggingface_hub-0.34.4.tar.gz", hash = "sha256:a4228daa6fb001be3f4f4bdaf9a0db00e1739235702848df00885c9b5742c85c"},
|
| 1734 |
]
|
| 1735 |
|
| 1736 |
[package.dependencies]
|
| 1737 |
filelock = "*"
|
| 1738 |
fsspec = ">=2023.5.0"
|
| 1739 |
+
hf-xet = {version = ">=1.1.3,<2.0.0", markers = "platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"arm64\" or platform_machine == \"aarch64\""}
|
| 1740 |
packaging = ">=20.9"
|
| 1741 |
pyyaml = ">=5.1"
|
| 1742 |
requests = "*"
|
|
|
|
| 1744 |
typing-extensions = ">=3.7.4.3"
|
| 1745 |
|
| 1746 |
[package.extras]
|
| 1747 |
+
all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "authlib (>=1.3.2)", "fastapi", "gradio (>=4.0.0)", "httpx", "itsdangerous", "jedi", "libcst (>=1.4.0)", "mypy (==1.15.0) ; python_version >= \"3.9\"", "mypy (>=1.14.1,<1.15.0) ; python_version == \"3.8\"", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
|
| 1748 |
cli = ["InquirerPy (==0.3.4)"]
|
| 1749 |
+
dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "authlib (>=1.3.2)", "fastapi", "gradio (>=4.0.0)", "httpx", "itsdangerous", "jedi", "libcst (>=1.4.0)", "mypy (==1.15.0) ; python_version >= \"3.9\"", "mypy (>=1.14.1,<1.15.0) ; python_version == \"3.8\"", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
|
| 1750 |
fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"]
|
| 1751 |
hf-transfer = ["hf-transfer (>=0.1.4)"]
|
| 1752 |
+
hf-xet = ["hf-xet (>=1.1.2,<2.0.0)"]
|
| 1753 |
inference = ["aiohttp"]
|
| 1754 |
+
mcp = ["aiohttp", "mcp (>=1.8.0)", "typer"]
|
| 1755 |
+
oauth = ["authlib (>=1.3.2)", "fastapi", "httpx", "itsdangerous"]
|
| 1756 |
+
quality = ["libcst (>=1.4.0)", "mypy (==1.15.0) ; python_version >= \"3.9\"", "mypy (>=1.14.1,<1.15.0) ; python_version == \"3.8\"", "ruff (>=0.9.0)"]
|
| 1757 |
tensorflow = ["graphviz", "pydot", "tensorflow"]
|
| 1758 |
tensorflow-testing = ["keras (<3.0)", "tensorflow"]
|
| 1759 |
+
testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "authlib (>=1.3.2)", "fastapi", "gradio (>=4.0.0)", "httpx", "itsdangerous", "jedi", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"]
|
| 1760 |
torch = ["safetensors[torch]", "torch"]
|
| 1761 |
typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"]
|
| 1762 |
|
|
|
|
| 5415 |
|
| 5416 |
[[package]]
|
| 5417 |
name = "surya-ocr"
|
| 5418 |
+
version = "0.16.7"
|
| 5419 |
description = "OCR, layout, reading order, and table recognition in 90+ languages"
|
| 5420 |
optional = false
|
| 5421 |
python-versions = "<4.0,>=3.10"
|
| 5422 |
groups = ["main"]
|
| 5423 |
files = [
|
| 5424 |
+
{file = "surya_ocr-0.16.7-py3-none-any.whl", hash = "sha256:20bf5f7f22832a15bee6a0b65343b439b0317877da997f49ce4ea8d7f595647f"},
|
| 5425 |
+
{file = "surya_ocr-0.16.7.tar.gz", hash = "sha256:bd3561acbbac9fb02069bedae5ec9fda1d7f868b17fd2a59447f79f61893a7cd"},
|
| 5426 |
]
|
| 5427 |
|
| 5428 |
[package.dependencies]
|
| 5429 |
click = ">=8.1.8,<9.0.0"
|
| 5430 |
einops = ">=0.8.1,<0.9.0"
|
| 5431 |
filetype = ">=1.2.0,<2.0.0"
|
| 5432 |
+
opencv-python-headless = "4.11.0.86"
|
| 5433 |
pillow = ">=10.2.0,<11.0.0"
|
| 5434 |
platformdirs = ">=4.3.6,<5.0.0"
|
| 5435 |
pre-commit = ">=4.2.0,<5.0.0"
|
|
|
|
| 5438 |
pypdfium2 = "4.30.0"
|
| 5439 |
python-dotenv = ">=1.0.0,<2.0.0"
|
| 5440 |
torch = ">=2.7.0,<3.0.0"
|
| 5441 |
+
transformers = ">=4.56.1"
|
| 5442 |
|
| 5443 |
[[package]]
|
| 5444 |
name = "sympy"
|
|
|
|
| 5565 |
|
| 5566 |
[[package]]
|
| 5567 |
name = "tokenizers"
|
| 5568 |
+
version = "0.22.0"
|
| 5569 |
description = ""
|
| 5570 |
optional = false
|
| 5571 |
python-versions = ">=3.9"
|
| 5572 |
groups = ["main"]
|
| 5573 |
files = [
|
| 5574 |
+
{file = "tokenizers-0.22.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:eaa9620122a3fb99b943f864af95ed14c8dfc0f47afa3b404ac8c16b3f2bb484"},
|
| 5575 |
+
{file = "tokenizers-0.22.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:71784b9ab5bf0ff3075bceeb198149d2c5e068549c0d18fe32d06ba0deb63f79"},
|
| 5576 |
+
{file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec5b71f668a8076802b0241a42387d48289f25435b86b769ae1837cad4172a17"},
|
| 5577 |
+
{file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ea8562fa7498850d02a16178105b58803ea825b50dc9094d60549a7ed63654bb"},
|
| 5578 |
+
{file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4136e1558a9ef2e2f1de1555dcd573e1cbc4a320c1a06c4107a3d46dc8ac6e4b"},
|
| 5579 |
+
{file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cdf5954de3962a5fd9781dc12048d24a1a6f1f5df038c6e95db328cd22964206"},
|
| 5580 |
+
{file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8337ca75d0731fc4860e6204cc24bb36a67d9736142aa06ed320943b50b1e7ed"},
|
| 5581 |
+
{file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a89264e26f63c449d8cded9061adea7b5de53ba2346fc7e87311f7e4117c1cc8"},
|
| 5582 |
+
{file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:790bad50a1b59d4c21592f9c3cf5e5cf9c3c7ce7e1a23a739f13e01fb1be377a"},
|
| 5583 |
+
{file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:76cf6757c73a10ef10bf06fa937c0ec7393d90432f543f49adc8cab3fb6f26cb"},
|
| 5584 |
+
{file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:1626cb186e143720c62c6c6b5371e62bbc10af60481388c0da89bc903f37ea0c"},
|
| 5585 |
+
{file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:da589a61cbfea18ae267723d6b029b84598dc8ca78db9951d8f5beff72d8507c"},
|
| 5586 |
+
{file = "tokenizers-0.22.0-cp39-abi3-win32.whl", hash = "sha256:dbf9d6851bddae3e046fedfb166f47743c1c7bd11c640f0691dd35ef0bcad3be"},
|
| 5587 |
+
{file = "tokenizers-0.22.0-cp39-abi3-win_amd64.whl", hash = "sha256:c78174859eeaee96021f248a56c801e36bfb6bd5b067f2e95aa82445ca324f00"},
|
| 5588 |
+
{file = "tokenizers-0.22.0.tar.gz", hash = "sha256:2e33b98525be8453f355927f3cab312c36cd3e44f4d7e9e97da2fa94d0a49dcb"},
|
| 5589 |
]
|
| 5590 |
|
| 5591 |
[package.dependencies]
|
|
|
|
| 5594 |
[package.extras]
|
| 5595 |
dev = ["tokenizers[testing]"]
|
| 5596 |
docs = ["setuptools-rust", "sphinx", "sphinx-rtd-theme"]
|
| 5597 |
+
testing = ["black (==22.3)", "datasets", "numpy", "pytest", "pytest-asyncio", "requests", "ruff"]
|
| 5598 |
|
| 5599 |
[[package]]
|
| 5600 |
name = "toml"
|
|
|
|
| 5774 |
|
| 5775 |
[[package]]
|
| 5776 |
name = "transformers"
|
| 5777 |
+
version = "4.56.1"
|
| 5778 |
description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
|
| 5779 |
optional = false
|
| 5780 |
python-versions = ">=3.9.0"
|
| 5781 |
groups = ["main"]
|
| 5782 |
files = [
|
| 5783 |
+
{file = "transformers-4.56.1-py3-none-any.whl", hash = "sha256:1697af6addfb6ddbce9618b763f4b52d5a756f6da4899ffd1b4febf58b779248"},
|
| 5784 |
+
{file = "transformers-4.56.1.tar.gz", hash = "sha256:0d88b1089a563996fc5f2c34502f10516cad3ea1aa89f179f522b54c8311fe74"},
|
| 5785 |
]
|
| 5786 |
|
| 5787 |
[package.dependencies]
|
| 5788 |
filelock = "*"
|
| 5789 |
+
huggingface-hub = ">=0.34.0,<1.0"
|
| 5790 |
numpy = ">=1.17"
|
| 5791 |
packaging = ">=20.0"
|
| 5792 |
pyyaml = ">=5.1"
|
| 5793 |
regex = "!=2019.12.17"
|
| 5794 |
requests = "*"
|
| 5795 |
safetensors = ">=0.4.3"
|
| 5796 |
+
tokenizers = ">=0.22.0,<=0.23.0"
|
| 5797 |
tqdm = ">=4.27"
|
| 5798 |
|
| 5799 |
[package.extras]
|
| 5800 |
accelerate = ["accelerate (>=0.26.0)"]
|
| 5801 |
+
all = ["Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "jinja2 (>=3.1.0)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.6.1,<=0.9)", "librosa", "mistral-common[opencv] (>=1.6.3)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision"]
|
|
|
|
| 5802 |
audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
|
| 5803 |
benchmark = ["optimum-benchmark (>=0.3.0)"]
|
| 5804 |
+
chat-template = ["jinja2 (>=3.1.0)"]
|
| 5805 |
codecarbon = ["codecarbon (>=2.8.1)"]
|
| 5806 |
deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"]
|
| 5807 |
+
deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "optuna", "parameterized (>=0.9)", "protobuf", "psutil", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
|
| 5808 |
+
dev = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "jinja2 (>=3.1.0)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.6.1,<=0.9)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)"]
|
| 5809 |
+
dev-tensorflow = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "onnxconverter-common", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "tf2onnx", "timeout-decorator", "tokenizers (>=0.22.0,<=0.23.0)", "urllib3 (<2.0.0)"]
|
| 5810 |
+
dev-torch = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "kenlm", "kernels (>=0.6.1,<=0.9)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)"]
|
| 5811 |
flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"]
|
| 5812 |
flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
|
| 5813 |
ftfy = ["ftfy"]
|
| 5814 |
+
hf-xet = ["hf_xet"]
|
| 5815 |
+
hub-kernels = ["kernels (>=0.6.1,<=0.9)"]
|
| 5816 |
+
integrations = ["kernels (>=0.6.1,<=0.9)", "optuna", "ray[tune] (>=2.7.0)", "sigopt"]
|
| 5817 |
+
ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)"]
|
| 5818 |
+
mistral-common = ["mistral-common[opencv] (>=1.6.3)"]
|
| 5819 |
modelcreation = ["cookiecutter (==1.7.3)"]
|
| 5820 |
natten = ["natten (>=0.14.6,<0.15.0)"]
|
| 5821 |
num2words = ["num2words"]
|
| 5822 |
onnx = ["onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "tf2onnx"]
|
| 5823 |
onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"]
|
| 5824 |
+
open-telemetry = ["opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk"]
|
| 5825 |
optuna = ["optuna"]
|
| 5826 |
+
quality = ["GitPython (<3.1.19)", "datasets (>=2.15.0)", "libcst", "pandas (<2.3.0)", "rich", "ruff (==0.11.2)", "urllib3 (<2.0.0)"]
|
| 5827 |
ray = ["ray[tune] (>=2.7.0)"]
|
| 5828 |
+
retrieval = ["datasets (>=2.15.0)", "faiss-cpu"]
|
| 5829 |
ruff = ["ruff (==0.11.2)"]
|
| 5830 |
sagemaker = ["sagemaker (>=2.31.0)"]
|
| 5831 |
sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"]
|
| 5832 |
+
serving = ["accelerate (>=0.26.0)", "fastapi", "openai (>=1.98.0)", "pydantic (>=2)", "starlette", "torch (>=2.2)", "uvicorn"]
|
| 5833 |
sigopt = ["sigopt"]
|
| 5834 |
sklearn = ["scikit-learn"]
|
| 5835 |
speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
|
| 5836 |
+
testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "parameterized (>=0.9)", "psutil", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
|
| 5837 |
tf = ["keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"]
|
| 5838 |
tf-cpu = ["keras (>2.9,<2.16)", "keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow-cpu (>2.9,<2.16)", "tensorflow-probability (<0.24)", "tensorflow-text (<2.16)", "tf2onnx"]
|
| 5839 |
tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
|
| 5840 |
tiktoken = ["blobfile", "tiktoken"]
|
| 5841 |
+
timm = ["timm (!=1.0.18,<=1.0.19)"]
|
| 5842 |
+
tokenizers = ["tokenizers (>=0.22.0,<=0.23.0)"]
|
| 5843 |
+
torch = ["accelerate (>=0.26.0)", "torch (>=2.2)"]
|
| 5844 |
torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
|
| 5845 |
torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"]
|
| 5846 |
+
torchhub = ["filelock", "huggingface-hub (>=0.34.0,<1.0)", "importlib_metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "tqdm (>=4.27)"]
|
| 5847 |
video = ["av"]
|
| 5848 |
vision = ["Pillow (>=10.0.1,<=15.0)"]
|
| 5849 |
|
|
|
|
| 6532 |
[metadata]
|
| 6533 |
lock-version = "2.1"
|
| 6534 |
python-versions = "^3.10"
|
| 6535 |
+
content-hash = "ca3704b625f021f3b09ad23df31ad3b48de68b2bf0df0fc144babcb6f5010767"
|
pyproject.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
-
version = "1.9.
|
| 4 |
description = "Convert documents to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|
|
@@ -26,7 +26,7 @@ torch = "^2.7.0"
|
|
| 26 |
tqdm = "^4.66.1"
|
| 27 |
ftfy = "^6.1.1"
|
| 28 |
rapidfuzz = "^3.8.1"
|
| 29 |
-
surya-ocr = "^0.16.
|
| 30 |
regex = "^2024.4.28"
|
| 31 |
pdftext = "~0.6.3"
|
| 32 |
markdownify = "^1.1.0"
|
|
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
+
version = "1.9.3"
|
| 4 |
description = "Convert documents to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|
|
|
|
| 26 |
tqdm = "^4.66.1"
|
| 27 |
ftfy = "^6.1.1"
|
| 28 |
rapidfuzz = "^3.8.1"
|
| 29 |
+
surya-ocr = "^0.16.7"
|
| 30 |
regex = "^2024.4.28"
|
| 31 |
pdftext = "~0.6.3"
|
| 32 |
markdownify = "^1.1.0"
|