Vik Paruchuri committed on
Commit
7e7ea81
·
2 Parent(s): 8adb727 906130f

Merge pull request #877 from datalab-to/dev

Browse files
marker/converters/pdf.py CHANGED
@@ -1,5 +1,7 @@
1
  import os
2
 
 
 
3
  os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
4
 
5
  from collections import defaultdict
@@ -171,7 +173,7 @@ class PdfConverter(BaseConverter):
171
  if temp_file is not None and os.path.exists(temp_file.name):
172
  os.unlink(temp_file.name)
173
 
174
- def build_document(self, filepath: str):
175
  provider_cls = provider_from_filepath(filepath)
176
  layout_builder = self.resolve_dependencies(self.layout_builder_class)
177
  line_builder = self.resolve_dependencies(LineBuilder)
 
1
  import os
2
 
3
+ from marker.schema.document import Document
4
+
5
  os.environ["TOKENIZERS_PARALLELISM"] = "false" # disables a tokenizers warning
6
 
7
  from collections import defaultdict
 
173
  if temp_file is not None and os.path.exists(temp_file.name):
174
  os.unlink(temp_file.name)
175
 
176
+ def build_document(self, filepath: str) -> Document:
177
  provider_cls = provider_from_filepath(filepath)
178
  layout_builder = self.resolve_dependencies(self.layout_builder_class)
179
  line_builder = self.resolve_dependencies(LineBuilder)
marker/models.py CHANGED
@@ -1,5 +1,8 @@
1
  import os
2
- os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for an op, which is not supported on MPS
 
 
 
3
 
4
  from surya.foundation import FoundationPredictor
5
  from surya.detection import DetectionPredictor
@@ -8,13 +11,18 @@ from surya.ocr_error import OCRErrorPredictor
8
  from surya.recognition import RecognitionPredictor
9
  from surya.table_rec import TableRecPredictor
10
 
11
- def create_model_dict(device=None, dtype=None) -> dict:
12
- foundation_predictor = FoundationPredictor(device=device, dtype=dtype)
 
 
 
 
 
13
  return {
14
  "foundation_model": foundation_predictor,
15
  "layout_model": LayoutPredictor(device=device, dtype=dtype),
16
  "recognition_model": RecognitionPredictor(foundation_predictor),
17
  "table_rec_model": TableRecPredictor(device=device, dtype=dtype),
18
  "detection_model": DetectionPredictor(device=device, dtype=dtype),
19
- "ocr_error_model": OCRErrorPredictor(device=device, dtype=dtype)
20
- }
 
1
  import os
2
+
3
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = (
4
+ "1" # Transformers uses .isin for an op, which is not supported on MPS
5
+ )
6
 
7
  from surya.foundation import FoundationPredictor
8
  from surya.detection import DetectionPredictor
 
11
  from surya.recognition import RecognitionPredictor
12
  from surya.table_rec import TableRecPredictor
13
 
14
+
15
+ def create_model_dict(
16
+ device=None, dtype=None, attention_implementation: str | None = None
17
+ ) -> dict:
18
+ foundation_predictor = FoundationPredictor(
19
+ device=device, dtype=dtype, attention_implementation=attention_implementation
20
+ )
21
  return {
22
  "foundation_model": foundation_predictor,
23
  "layout_model": LayoutPredictor(device=device, dtype=dtype),
24
  "recognition_model": RecognitionPredictor(foundation_predictor),
25
  "table_rec_model": TableRecPredictor(device=device, dtype=dtype),
26
  "detection_model": DetectionPredictor(device=device, dtype=dtype),
27
+ "ocr_error_model": OCRErrorPredictor(device=device, dtype=dtype),
28
+ }
marker/schema/blocks/base.py CHANGED
@@ -100,6 +100,7 @@ class Block(BaseModel):
100
  lowres_image: Image.Image | None = None
101
  highres_image: Image.Image | None = None
102
  removed: bool = False # Has block been replaced by new block?
 
103
 
104
  model_config = ConfigDict(arbitrary_types_allowed=True)
105
 
@@ -114,6 +115,16 @@ class Block(BaseModel):
114
  block_attrs = block.model_dump(exclude=["id", "block_id", "block_type"])
115
  return cls(**block_attrs)
116
 
 
 
 
 
 
 
 
 
 
 
117
  def get_image(
118
  self,
119
  document: Document,
 
100
  lowres_image: Image.Image | None = None
101
  highres_image: Image.Image | None = None
102
  removed: bool = False # Has block been replaced by new block?
103
+ _metadata: Optional[dict] = None
104
 
105
  model_config = ConfigDict(arbitrary_types_allowed=True)
106
 
 
115
  block_attrs = block.model_dump(exclude=["id", "block_id", "block_type"])
116
  return cls(**block_attrs)
117
 
118
+ def set_internal_metadata(self, key, data):
119
+ if self._metadata is None:
120
+ self._metadata = {}
121
+ self._metadata[key] = data
122
+
123
+ def get_internal_metadata(self, key):
124
+ if self._metadata is None:
125
+ return None
126
+ return self._metadata.get(key)
127
+
128
  def get_image(
129
  self,
130
  document: Document,
poetry.lock CHANGED
@@ -1,4 +1,4 @@
1
- # This file is automatically @generated by Poetry 2.1.4 and should not be changed by hand.
2
 
3
  [[package]]
4
  name = "aiohappyeyeballs"
@@ -1652,6 +1652,28 @@ files = [
1652
  {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
1653
  ]
1654
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1655
  [[package]]
1656
  name = "httpcore"
1657
  version = "1.0.7"
@@ -1701,19 +1723,20 @@ zstd = ["zstandard (>=0.18.0)"]
1701
 
1702
  [[package]]
1703
  name = "huggingface-hub"
1704
- version = "0.30.2"
1705
  description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
1706
  optional = false
1707
  python-versions = ">=3.8.0"
1708
  groups = ["main", "dev"]
1709
  files = [
1710
- {file = "huggingface_hub-0.30.2-py3-none-any.whl", hash = "sha256:68ff05969927058cfa41df4f2155d4bb48f5f54f719dd0390103eefa9b191e28"},
1711
- {file = "huggingface_hub-0.30.2.tar.gz", hash = "sha256:9a7897c5b6fd9dad3168a794a8998d6378210f5b9688d0dfc180b1a228dc2466"},
1712
  ]
1713
 
1714
  [package.dependencies]
1715
  filelock = "*"
1716
  fsspec = ">=2023.5.0"
 
1717
  packaging = ">=20.9"
1718
  pyyaml = ">=5.1"
1719
  requests = "*"
@@ -1721,17 +1744,19 @@ tqdm = ">=4.42.1"
1721
  typing-extensions = ">=3.7.4.3"
1722
 
1723
  [package.extras]
1724
- all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
1725
  cli = ["InquirerPy (==0.3.4)"]
1726
- dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
1727
  fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"]
1728
  hf-transfer = ["hf-transfer (>=0.1.4)"]
1729
- hf-xet = ["hf-xet (>=0.1.4)"]
1730
  inference = ["aiohttp"]
1731
- quality = ["libcst (==1.4.0)", "mypy (==1.5.1)", "ruff (>=0.9.0)"]
 
 
1732
  tensorflow = ["graphviz", "pydot", "tensorflow"]
1733
  tensorflow-testing = ["keras (<3.0)", "tensorflow"]
1734
- testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"]
1735
  torch = ["safetensors[torch]", "torch"]
1736
  typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"]
1737
 
@@ -5390,21 +5415,21 @@ snowflake = ["snowflake-connector-python (>=3.3.0) ; python_version < \"3.12\"",
5390
 
5391
  [[package]]
5392
  name = "surya-ocr"
5393
- version = "0.16.0"
5394
  description = "OCR, layout, reading order, and table recognition in 90+ languages"
5395
  optional = false
5396
  python-versions = "<4.0,>=3.10"
5397
  groups = ["main"]
5398
  files = [
5399
- {file = "surya_ocr-0.16.0-py3-none-any.whl", hash = "sha256:57cfd1fb9feec410d5e393455efc0a0352f6e3a96ceabd5ba9aea9b66b4d12aa"},
5400
- {file = "surya_ocr-0.16.0.tar.gz", hash = "sha256:8237a580cb227fce4350eea89acf7b8caf9dd62ad3135c86c020eb44ade448ae"},
5401
  ]
5402
 
5403
  [package.dependencies]
5404
  click = ">=8.1.8,<9.0.0"
5405
  einops = ">=0.8.1,<0.9.0"
5406
  filetype = ">=1.2.0,<2.0.0"
5407
- opencv-python-headless = ">=4.11.0.86,<5.0.0.0"
5408
  pillow = ">=10.2.0,<11.0.0"
5409
  platformdirs = ">=4.3.6,<5.0.0"
5410
  pre-commit = ">=4.2.0,<5.0.0"
@@ -5413,7 +5438,7 @@ pydantic-settings = ">=2.1.0,<3.0.0"
5413
  pypdfium2 = "4.30.0"
5414
  python-dotenv = ">=1.0.0,<2.0.0"
5415
  torch = ">=2.7.0,<3.0.0"
5416
- transformers = ">=4.51.2,<4.54.0"
5417
 
5418
  [[package]]
5419
  name = "sympy"
@@ -5540,27 +5565,27 @@ test = ["pytest", "ruff"]
5540
 
5541
  [[package]]
5542
  name = "tokenizers"
5543
- version = "0.21.1"
5544
  description = ""
5545
  optional = false
5546
  python-versions = ">=3.9"
5547
  groups = ["main"]
5548
  files = [
5549
- {file = "tokenizers-0.21.1-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:e78e413e9e668ad790a29456e677d9d3aa50a9ad311a40905d6861ba7692cf41"},
5550
- {file = "tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:cd51cd0a91ecc801633829fcd1fda9cf8682ed3477c6243b9a095539de4aecf3"},
5551
- {file = "tokenizers-0.21.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28da6b72d4fb14ee200a1bd386ff74ade8992d7f725f2bde2c495a9a98cf4d9f"},
5552
- {file = "tokenizers-0.21.1-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:34d8cfde551c9916cb92014e040806122295a6800914bab5865deb85623931cf"},
5553
- {file = "tokenizers-0.21.1-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aaa852d23e125b73d283c98f007e06d4595732104b65402f46e8ef24b588d9f8"},
5554
- {file = "tokenizers-0.21.1-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a21a15d5c8e603331b8a59548bbe113564136dc0f5ad8306dd5033459a226da0"},
5555
- {file = "tokenizers-0.21.1-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2fdbd4c067c60a0ac7eca14b6bd18a5bebace54eb757c706b47ea93204f7a37c"},
5556
- {file = "tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dd9a0061e403546f7377df940e866c3e678d7d4e9643d0461ea442b4f89e61a"},
5557
- {file = "tokenizers-0.21.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:db9484aeb2e200c43b915a1a0150ea885e35f357a5a8fabf7373af333dcc8dbf"},
5558
- {file = "tokenizers-0.21.1-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:ed248ab5279e601a30a4d67bdb897ecbe955a50f1e7bb62bd99f07dd11c2f5b6"},
5559
- {file = "tokenizers-0.21.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:9ac78b12e541d4ce67b4dfd970e44c060a2147b9b2a21f509566d556a509c67d"},
5560
- {file = "tokenizers-0.21.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:e5a69c1a4496b81a5ee5d2c1f3f7fbdf95e90a0196101b0ee89ed9956b8a168f"},
5561
- {file = "tokenizers-0.21.1-cp39-abi3-win32.whl", hash = "sha256:1039a3a5734944e09de1d48761ade94e00d0fa760c0e0551151d4dd851ba63e3"},
5562
- {file = "tokenizers-0.21.1-cp39-abi3-win_amd64.whl", hash = "sha256:0f0dcbcc9f6e13e675a66d7a5f2f225a736745ce484c1a4e07476a89ccdad382"},
5563
- {file = "tokenizers-0.21.1.tar.gz", hash = "sha256:a1bb04dc5b448985f86ecd4b05407f5a8d97cb2c0532199b2a302a604a0165ab"},
5564
  ]
5565
 
5566
  [package.dependencies]
@@ -5569,7 +5594,7 @@ huggingface-hub = ">=0.16.4,<1.0"
5569
  [package.extras]
5570
  dev = ["tokenizers[testing]"]
5571
  docs = ["setuptools-rust", "sphinx", "sphinx-rtd-theme"]
5572
- testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests", "ruff"]
5573
 
5574
  [[package]]
5575
  name = "toml"
@@ -5749,74 +5774,76 @@ test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,
5749
 
5750
  [[package]]
5751
  name = "transformers"
5752
- version = "4.51.3"
5753
  description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
5754
  optional = false
5755
  python-versions = ">=3.9.0"
5756
  groups = ["main"]
5757
  files = [
5758
- {file = "transformers-4.51.3-py3-none-any.whl", hash = "sha256:fd3279633ceb2b777013234bbf0b4f5c2d23c4626b05497691f00cfda55e8a83"},
5759
- {file = "transformers-4.51.3.tar.gz", hash = "sha256:e292fcab3990c6defe6328f0f7d2004283ca81a7a07b2de9a46d67fd81ea1409"},
5760
  ]
5761
 
5762
  [package.dependencies]
5763
  filelock = "*"
5764
- huggingface-hub = ">=0.30.0,<1.0"
5765
  numpy = ">=1.17"
5766
  packaging = ">=20.0"
5767
  pyyaml = ">=5.1"
5768
  regex = "!=2019.12.17"
5769
  requests = "*"
5770
  safetensors = ">=0.4.3"
5771
- tokenizers = ">=0.21,<0.22"
5772
  tqdm = ">=4.27"
5773
 
5774
  [package.extras]
5775
  accelerate = ["accelerate (>=0.26.0)"]
5776
- agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=2.0)"]
5777
- all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.3.2,<0.4)", "librosa", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision"]
5778
  audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
5779
  benchmark = ["optimum-benchmark (>=0.3.0)"]
 
5780
  codecarbon = ["codecarbon (>=2.8.1)"]
5781
  deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"]
5782
- deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
5783
- dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.3.2,<0.4)", "libcst", "librosa", "nltk (<=3.8.1)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
5784
- dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.21,<0.22)", "urllib3 (<2.0.0)"]
5785
- dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "kernels (>=0.3.2,<0.4)", "libcst", "librosa", "nltk (<=3.8.1)", "num2words", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (<=1.0.11)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"]
5786
  flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"]
5787
  flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
5788
  ftfy = ["ftfy"]
5789
- hf-xet = ["hf-xet"]
5790
- hub-kernels = ["kernels (>=0.3.2,<0.4)"]
5791
- integrations = ["kernels (>=0.3.2,<0.4)", "optuna", "ray[tune] (>=2.7.0)", "sigopt"]
5792
- ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"]
 
5793
  modelcreation = ["cookiecutter (==1.7.3)"]
5794
  natten = ["natten (>=0.14.6,<0.15.0)"]
5795
  num2words = ["num2words"]
5796
  onnx = ["onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "tf2onnx"]
5797
  onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"]
 
5798
  optuna = ["optuna"]
5799
- quality = ["GitPython (<3.1.19)", "datasets (!=2.5.0)", "isort (>=5.5.4)", "libcst", "rich", "ruff (==0.11.2)", "urllib3 (<2.0.0)"]
5800
  ray = ["ray[tune] (>=2.7.0)"]
5801
- retrieval = ["datasets (!=2.5.0)", "faiss-cpu"]
5802
  ruff = ["ruff (==0.11.2)"]
5803
  sagemaker = ["sagemaker (>=2.31.0)"]
5804
  sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"]
5805
- serving = ["fastapi", "pydantic", "starlette", "uvicorn"]
5806
  sigopt = ["sigopt"]
5807
  sklearn = ["scikit-learn"]
5808
  speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
5809
- testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "nltk (<=3.8.1)", "parameterized", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
5810
  tf = ["keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"]
5811
  tf-cpu = ["keras (>2.9,<2.16)", "keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow-cpu (>2.9,<2.16)", "tensorflow-probability (<0.24)", "tensorflow-text (<2.16)", "tf2onnx"]
5812
  tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
5813
  tiktoken = ["blobfile", "tiktoken"]
5814
- timm = ["timm (<=1.0.11)"]
5815
- tokenizers = ["tokenizers (>=0.21,<0.22)"]
5816
- torch = ["accelerate (>=0.26.0)", "torch (>=2.0)"]
5817
  torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
5818
  torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"]
5819
- torchhub = ["filelock", "huggingface-hub (>=0.30.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.21,<0.22)", "torch (>=2.0)", "tqdm (>=4.27)"]
5820
  video = ["av"]
5821
  vision = ["Pillow (>=10.0.1,<=15.0)"]
5822
 
@@ -6505,4 +6532,4 @@ full = ["ebooklib", "mammoth", "openpyxl", "python-pptx", "weasyprint"]
6505
  [metadata]
6506
  lock-version = "2.1"
6507
  python-versions = "^3.10"
6508
- content-hash = "1ac291f5af0c2aafad6e5dbb13b4eba9628070f6f5310630a870dd45b70ccecf"
 
1
+ # This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand.
2
 
3
  [[package]]
4
  name = "aiohappyeyeballs"
 
1652
  {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
1653
  ]
1654
 
1655
+ [[package]]
1656
+ name = "hf-xet"
1657
+ version = "1.1.9"
1658
+ description = "Fast transfer of large files with the Hugging Face Hub."
1659
+ optional = false
1660
+ python-versions = ">=3.8"
1661
+ groups = ["main", "dev"]
1662
+ markers = "platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"arm64\" or platform_machine == \"aarch64\""
1663
+ files = [
1664
+ {file = "hf_xet-1.1.9-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:a3b6215f88638dd7a6ff82cb4e738dcbf3d863bf667997c093a3c990337d1160"},
1665
+ {file = "hf_xet-1.1.9-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:9b486de7a64a66f9a172f4b3e0dfe79c9f0a93257c501296a2521a13495a698a"},
1666
+ {file = "hf_xet-1.1.9-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4c5a840c2c4e6ec875ed13703a60e3523bc7f48031dfd750923b2a4d1a5fc3c"},
1667
+ {file = "hf_xet-1.1.9-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:96a6139c9e44dad1c52c52520db0fffe948f6bce487cfb9d69c125f254bb3790"},
1668
+ {file = "hf_xet-1.1.9-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ad1022e9a998e784c97b2173965d07fe33ee26e4594770b7785a8cc8f922cd95"},
1669
+ {file = "hf_xet-1.1.9-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:86754c2d6d5afb11b0a435e6e18911a4199262fe77553f8c50d75e21242193ea"},
1670
+ {file = "hf_xet-1.1.9-cp37-abi3-win_amd64.whl", hash = "sha256:5aad3933de6b725d61d51034e04174ed1dce7a57c63d530df0014dea15a40127"},
1671
+ {file = "hf_xet-1.1.9.tar.gz", hash = "sha256:c99073ce404462e909f1d5839b2d14a3827b8fe75ed8aed551ba6609c026c803"},
1672
+ ]
1673
+
1674
+ [package.extras]
1675
+ tests = ["pytest"]
1676
+
1677
  [[package]]
1678
  name = "httpcore"
1679
  version = "1.0.7"
 
1723
 
1724
  [[package]]
1725
  name = "huggingface-hub"
1726
+ version = "0.34.4"
1727
  description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
1728
  optional = false
1729
  python-versions = ">=3.8.0"
1730
  groups = ["main", "dev"]
1731
  files = [
1732
+ {file = "huggingface_hub-0.34.4-py3-none-any.whl", hash = "sha256:9b365d781739c93ff90c359844221beef048403f1bc1f1c123c191257c3c890a"},
1733
+ {file = "huggingface_hub-0.34.4.tar.gz", hash = "sha256:a4228daa6fb001be3f4f4bdaf9a0db00e1739235702848df00885c9b5742c85c"},
1734
  ]
1735
 
1736
  [package.dependencies]
1737
  filelock = "*"
1738
  fsspec = ">=2023.5.0"
1739
+ hf-xet = {version = ">=1.1.3,<2.0.0", markers = "platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"arm64\" or platform_machine == \"aarch64\""}
1740
  packaging = ">=20.9"
1741
  pyyaml = ">=5.1"
1742
  requests = "*"
 
1744
  typing-extensions = ">=3.7.4.3"
1745
 
1746
  [package.extras]
1747
+ all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "authlib (>=1.3.2)", "fastapi", "gradio (>=4.0.0)", "httpx", "itsdangerous", "jedi", "libcst (>=1.4.0)", "mypy (==1.15.0) ; python_version >= \"3.9\"", "mypy (>=1.14.1,<1.15.0) ; python_version == \"3.8\"", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
1748
  cli = ["InquirerPy (==0.3.4)"]
1749
+ dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "authlib (>=1.3.2)", "fastapi", "gradio (>=4.0.0)", "httpx", "itsdangerous", "jedi", "libcst (>=1.4.0)", "mypy (==1.15.0) ; python_version >= \"3.9\"", "mypy (>=1.14.1,<1.15.0) ; python_version == \"3.8\"", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"]
1750
  fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"]
1751
  hf-transfer = ["hf-transfer (>=0.1.4)"]
1752
+ hf-xet = ["hf-xet (>=1.1.2,<2.0.0)"]
1753
  inference = ["aiohttp"]
1754
+ mcp = ["aiohttp", "mcp (>=1.8.0)", "typer"]
1755
+ oauth = ["authlib (>=1.3.2)", "fastapi", "httpx", "itsdangerous"]
1756
+ quality = ["libcst (>=1.4.0)", "mypy (==1.15.0) ; python_version >= \"3.9\"", "mypy (>=1.14.1,<1.15.0) ; python_version == \"3.8\"", "ruff (>=0.9.0)"]
1757
  tensorflow = ["graphviz", "pydot", "tensorflow"]
1758
  tensorflow-testing = ["keras (<3.0)", "tensorflow"]
1759
+ testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "authlib (>=1.3.2)", "fastapi", "gradio (>=4.0.0)", "httpx", "itsdangerous", "jedi", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"]
1760
  torch = ["safetensors[torch]", "torch"]
1761
  typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"]
1762
 
 
5415
 
5416
  [[package]]
5417
  name = "surya-ocr"
5418
+ version = "0.16.7"
5419
  description = "OCR, layout, reading order, and table recognition in 90+ languages"
5420
  optional = false
5421
  python-versions = "<4.0,>=3.10"
5422
  groups = ["main"]
5423
  files = [
5424
+ {file = "surya_ocr-0.16.7-py3-none-any.whl", hash = "sha256:20bf5f7f22832a15bee6a0b65343b439b0317877da997f49ce4ea8d7f595647f"},
5425
+ {file = "surya_ocr-0.16.7.tar.gz", hash = "sha256:bd3561acbbac9fb02069bedae5ec9fda1d7f868b17fd2a59447f79f61893a7cd"},
5426
  ]
5427
 
5428
  [package.dependencies]
5429
  click = ">=8.1.8,<9.0.0"
5430
  einops = ">=0.8.1,<0.9.0"
5431
  filetype = ">=1.2.0,<2.0.0"
5432
+ opencv-python-headless = "4.11.0.86"
5433
  pillow = ">=10.2.0,<11.0.0"
5434
  platformdirs = ">=4.3.6,<5.0.0"
5435
  pre-commit = ">=4.2.0,<5.0.0"
 
5438
  pypdfium2 = "4.30.0"
5439
  python-dotenv = ">=1.0.0,<2.0.0"
5440
  torch = ">=2.7.0,<3.0.0"
5441
+ transformers = ">=4.56.1"
5442
 
5443
  [[package]]
5444
  name = "sympy"
 
5565
 
5566
  [[package]]
5567
  name = "tokenizers"
5568
+ version = "0.22.0"
5569
  description = ""
5570
  optional = false
5571
  python-versions = ">=3.9"
5572
  groups = ["main"]
5573
  files = [
5574
+ {file = "tokenizers-0.22.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:eaa9620122a3fb99b943f864af95ed14c8dfc0f47afa3b404ac8c16b3f2bb484"},
5575
+ {file = "tokenizers-0.22.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:71784b9ab5bf0ff3075bceeb198149d2c5e068549c0d18fe32d06ba0deb63f79"},
5576
+ {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec5b71f668a8076802b0241a42387d48289f25435b86b769ae1837cad4172a17"},
5577
+ {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ea8562fa7498850d02a16178105b58803ea825b50dc9094d60549a7ed63654bb"},
5578
+ {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4136e1558a9ef2e2f1de1555dcd573e1cbc4a320c1a06c4107a3d46dc8ac6e4b"},
5579
+ {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cdf5954de3962a5fd9781dc12048d24a1a6f1f5df038c6e95db328cd22964206"},
5580
+ {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8337ca75d0731fc4860e6204cc24bb36a67d9736142aa06ed320943b50b1e7ed"},
5581
+ {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a89264e26f63c449d8cded9061adea7b5de53ba2346fc7e87311f7e4117c1cc8"},
5582
+ {file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:790bad50a1b59d4c21592f9c3cf5e5cf9c3c7ce7e1a23a739f13e01fb1be377a"},
5583
+ {file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:76cf6757c73a10ef10bf06fa937c0ec7393d90432f543f49adc8cab3fb6f26cb"},
5584
+ {file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:1626cb186e143720c62c6c6b5371e62bbc10af60481388c0da89bc903f37ea0c"},
5585
+ {file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:da589a61cbfea18ae267723d6b029b84598dc8ca78db9951d8f5beff72d8507c"},
5586
+ {file = "tokenizers-0.22.0-cp39-abi3-win32.whl", hash = "sha256:dbf9d6851bddae3e046fedfb166f47743c1c7bd11c640f0691dd35ef0bcad3be"},
5587
+ {file = "tokenizers-0.22.0-cp39-abi3-win_amd64.whl", hash = "sha256:c78174859eeaee96021f248a56c801e36bfb6bd5b067f2e95aa82445ca324f00"},
5588
+ {file = "tokenizers-0.22.0.tar.gz", hash = "sha256:2e33b98525be8453f355927f3cab312c36cd3e44f4d7e9e97da2fa94d0a49dcb"},
5589
  ]
5590
 
5591
  [package.dependencies]
 
5594
  [package.extras]
5595
  dev = ["tokenizers[testing]"]
5596
  docs = ["setuptools-rust", "sphinx", "sphinx-rtd-theme"]
5597
+ testing = ["black (==22.3)", "datasets", "numpy", "pytest", "pytest-asyncio", "requests", "ruff"]
5598
 
5599
  [[package]]
5600
  name = "toml"
 
5774
 
5775
  [[package]]
5776
  name = "transformers"
5777
+ version = "4.56.1"
5778
  description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow"
5779
  optional = false
5780
  python-versions = ">=3.9.0"
5781
  groups = ["main"]
5782
  files = [
5783
+ {file = "transformers-4.56.1-py3-none-any.whl", hash = "sha256:1697af6addfb6ddbce9618b763f4b52d5a756f6da4899ffd1b4febf58b779248"},
5784
+ {file = "transformers-4.56.1.tar.gz", hash = "sha256:0d88b1089a563996fc5f2c34502f10516cad3ea1aa89f179f522b54c8311fe74"},
5785
  ]
5786
 
5787
  [package.dependencies]
5788
  filelock = "*"
5789
+ huggingface-hub = ">=0.34.0,<1.0"
5790
  numpy = ">=1.17"
5791
  packaging = ">=20.0"
5792
  pyyaml = ">=5.1"
5793
  regex = "!=2019.12.17"
5794
  requests = "*"
5795
  safetensors = ">=0.4.3"
5796
+ tokenizers = ">=0.22.0,<=0.23.0"
5797
  tqdm = ">=4.27"
5798
 
5799
  [package.extras]
5800
  accelerate = ["accelerate (>=0.26.0)"]
5801
+ all = ["Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "jinja2 (>=3.1.0)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.6.1,<=0.9)", "librosa", "mistral-common[opencv] (>=1.6.3)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision"]
 
5802
  audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
5803
  benchmark = ["optimum-benchmark (>=0.3.0)"]
5804
+ chat-template = ["jinja2 (>=3.1.0)"]
5805
  codecarbon = ["codecarbon (>=2.8.1)"]
5806
  deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"]
5807
+ deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "optuna", "parameterized (>=0.9)", "protobuf", "psutil", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
5808
+ dev = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "jinja2 (>=3.1.0)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.6.1,<=0.9)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)"]
5809
+ dev-tensorflow = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "onnxconverter-common", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "tf2onnx", "timeout-decorator", "tokenizers (>=0.22.0,<=0.23.0)", "urllib3 (<2.0.0)"]
5810
+ dev-torch = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "kenlm", "kernels (>=0.6.1,<=0.9)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)"]
5811
  flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"]
5812
  flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
5813
  ftfy = ["ftfy"]
5814
+ hf-xet = ["hf_xet"]
5815
+ hub-kernels = ["kernels (>=0.6.1,<=0.9)"]
5816
+ integrations = ["kernels (>=0.6.1,<=0.9)", "optuna", "ray[tune] (>=2.7.0)", "sigopt"]
5817
+ ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)"]
5818
+ mistral-common = ["mistral-common[opencv] (>=1.6.3)"]
5819
  modelcreation = ["cookiecutter (==1.7.3)"]
5820
  natten = ["natten (>=0.14.6,<0.15.0)"]
5821
  num2words = ["num2words"]
5822
  onnx = ["onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "tf2onnx"]
5823
  onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"]
5824
+ open-telemetry = ["opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk"]
5825
  optuna = ["optuna"]
5826
+ quality = ["GitPython (<3.1.19)", "datasets (>=2.15.0)", "libcst", "pandas (<2.3.0)", "rich", "ruff (==0.11.2)", "urllib3 (<2.0.0)"]
5827
  ray = ["ray[tune] (>=2.7.0)"]
5828
+ retrieval = ["datasets (>=2.15.0)", "faiss-cpu"]
5829
  ruff = ["ruff (==0.11.2)"]
5830
  sagemaker = ["sagemaker (>=2.31.0)"]
5831
  sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"]
5832
+ serving = ["accelerate (>=0.26.0)", "fastapi", "openai (>=1.98.0)", "pydantic (>=2)", "starlette", "torch (>=2.2)", "uvicorn"]
5833
  sigopt = ["sigopt"]
5834
  sklearn = ["scikit-learn"]
5835
  speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
5836
+ testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "parameterized (>=0.9)", "psutil", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"]
5837
  tf = ["keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"]
5838
  tf-cpu = ["keras (>2.9,<2.16)", "keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow-cpu (>2.9,<2.16)", "tensorflow-probability (<0.24)", "tensorflow-text (<2.16)", "tf2onnx"]
5839
  tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"]
5840
  tiktoken = ["blobfile", "tiktoken"]
5841
+ timm = ["timm (!=1.0.18,<=1.0.19)"]
5842
+ tokenizers = ["tokenizers (>=0.22.0,<=0.23.0)"]
5843
+ torch = ["accelerate (>=0.26.0)", "torch (>=2.2)"]
5844
  torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"]
5845
  torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"]
5846
+ torchhub = ["filelock", "huggingface-hub (>=0.34.0,<1.0)", "importlib_metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "tqdm (>=4.27)"]
5847
  video = ["av"]
5848
  vision = ["Pillow (>=10.0.1,<=15.0)"]
5849
 
 
6532
  [metadata]
6533
  lock-version = "2.1"
6534
  python-versions = "^3.10"
6535
+ content-hash = "ca3704b625f021f3b09ad23df31ad3b48de68b2bf0df0fc144babcb6f5010767"
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
1
  [tool.poetry]
2
  name = "marker-pdf"
3
- version = "1.9.2"
4
  description = "Convert documents to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
@@ -26,7 +26,7 @@ torch = "^2.7.0"
26
  tqdm = "^4.66.1"
27
  ftfy = "^6.1.1"
28
  rapidfuzz = "^3.8.1"
29
- surya-ocr = "^0.16.0"
30
  regex = "^2024.4.28"
31
  pdftext = "~0.6.3"
32
  markdownify = "^1.1.0"
 
1
  [tool.poetry]
2
  name = "marker-pdf"
3
+ version = "1.9.3"
4
  description = "Convert documents to markdown with high speed and accuracy."
5
  authors = ["Vik Paruchuri <github@vikas.sh>"]
6
  readme = "README.md"
 
26
  tqdm = "^4.66.1"
27
  ftfy = "^6.1.1"
28
  rapidfuzz = "^3.8.1"
29
+ surya-ocr = "^0.16.7"
30
  regex = "^2024.4.28"
31
  pdftext = "~0.6.3"
32
  markdownify = "^1.1.0"