Vik Paruchuri
committed on
Commit
·
e194dc9
1
Parent(s):
9202ab9
Add test for stripping OCR
Browse files
- README.md +5 -1
- marker_app.py +4 -0
- poetry.lock +45 -142
- pyproject.toml +1 -1
- tests/builders/test_strip_existing_ocr.py +16 -0
README.md
CHANGED
|
@@ -5,7 +5,7 @@ Marker converts PDFs to markdown, JSON, and HTML quickly and accurately.
|
|
| 5 |
- Supports a wide range of documents
|
| 6 |
- Supports all languages
|
| 7 |
- Removes headers/footers/other artifacts
|
| 8 |
-
- Formats tables and code blocks
|
| 9 |
- Extracts and saves images along with the markdown
|
| 10 |
- Converts equations to latex
|
| 11 |
- Easily extensible with your own formatting and logic
|
|
@@ -19,6 +19,7 @@ Marker is a pipeline of deep learning models:
|
|
| 19 |
- Extract text, OCR if necessary (heuristics, [surya](https://github.com/VikParuchuri/surya))
|
| 20 |
- Detect page layout and find reading order ([surya](https://github.com/VikParuchuri/surya))
|
| 21 |
- Clean and format each block (heuristics, [texify](https://github.com/VikParuchuri/texify). [tabled](https://github.com/VikParuchuri/tabled))
|
|
|
|
| 22 |
- Combine blocks and postprocess complete text
|
| 23 |
|
| 24 |
It only uses models where necessary, which improves speed and accuracy.
|
|
@@ -66,6 +67,8 @@ PDF is a tricky format, so marker will not always work perfectly. Here are some
|
|
| 66 |
- Forms are not converted optimally
|
| 67 |
- Very complex layouts, with nested tables and forms, may not work
|
| 68 |
|
|
|
|
|
|
|
| 69 |
# Installation
|
| 70 |
|
| 71 |
You'll need python 3.10+ and PyTorch. You may need to install the CPU version of torch first if you're not using a Mac or a GPU machine. See [here](https://pytorch.org/get-started/locally/) for more details.
|
|
@@ -105,6 +108,7 @@ Options:
|
|
| 105 |
- `--disable_image_extraction`: Don't extract images from the PDF. If you also specify `--use_llm`, then images will be replaced with a description.
|
| 106 |
- `--page_range TEXT`: Specify which pages to process. Accepts comma-separated page numbers and ranges. Example: `--page_range "0,5-10,20"` will process pages 0, 5 through 10, and page 20.
|
| 107 |
- `--force_ocr`: Force OCR processing on the entire document, even for pages that might contain extractable text.
|
|
|
|
| 108 |
- `--debug`: Enable debug mode for additional logging and diagnostic information.
|
| 109 |
- `--processors TEXT`: Override the default processors by providing their full module paths, separated by commas. Example: `--processors "module1.processor1,module2.processor2"`
|
| 110 |
- `--config_json PATH`: Path to a JSON configuration file containing additional settings.
|
|
|
|
| 5 |
- Supports a wide range of documents
|
| 6 |
- Supports all languages
|
| 7 |
- Removes headers/footers/other artifacts
|
| 8 |
+
- Formats tables, forms, and code blocks
|
| 9 |
- Extracts and saves images along with the markdown
|
| 10 |
- Converts equations to latex
|
| 11 |
- Easily extensible with your own formatting and logic
|
|
|
|
| 19 |
- Extract text, OCR if necessary (heuristics, [surya](https://github.com/VikParuchuri/surya))
|
| 20 |
- Detect page layout and find reading order ([surya](https://github.com/VikParuchuri/surya))
|
| 21 |
- Clean and format each block (heuristics, [texify](https://github.com/VikParuchuri/texify). [tabled](https://github.com/VikParuchuri/tabled))
|
| 22 |
+
- Optionally use an LLM to improve quality
|
| 23 |
- Combine blocks and postprocess complete text
|
| 24 |
|
| 25 |
It only uses models where necessary, which improves speed and accuracy.
|
|
|
|
| 67 |
- Forms are not converted optimally
|
| 68 |
- Very complex layouts, with nested tables and forms, may not work
|
| 69 |
|
| 70 |
+
Note: Passing the `--use_llm` flag will mostly solve all of these issues.
|
| 71 |
+
|
| 72 |
# Installation
|
| 73 |
|
| 74 |
You'll need python 3.10+ and PyTorch. You may need to install the CPU version of torch first if you're not using a Mac or a GPU machine. See [here](https://pytorch.org/get-started/locally/) for more details.
|
|
|
|
| 108 |
- `--disable_image_extraction`: Don't extract images from the PDF. If you also specify `--use_llm`, then images will be replaced with a description.
|
| 109 |
- `--page_range TEXT`: Specify which pages to process. Accepts comma-separated page numbers and ranges. Example: `--page_range "0,5-10,20"` will process pages 0, 5 through 10, and page 20.
|
| 110 |
- `--force_ocr`: Force OCR processing on the entire document, even for pages that might contain extractable text.
|
| 111 |
+
- `--strip_existing_ocr`: Remove all existing OCR text in the document and re-OCR with surya.
|
| 112 |
- `--debug`: Enable debug mode for additional logging and diagnostic information.
|
| 113 |
- `--processors TEXT`: Override the default processors by providing their full module paths, separated by commas. Example: `--processors "module1.processor1,module2.processor2"`
|
| 114 |
- `--config_json PATH`: Path to a JSON configuration file containing additional settings.
|
marker_app.py
CHANGED
|
@@ -114,7 +114,9 @@ page_range = st.sidebar.text_input("Page range to parse, comma separated like 0,
|
|
| 114 |
output_format = st.sidebar.selectbox("Output format", ["markdown", "json", "html"], index=0)
|
| 115 |
run_marker = st.sidebar.button("Run Marker")
|
| 116 |
|
|
|
|
| 117 |
force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
|
|
|
|
| 118 |
debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
|
| 119 |
|
| 120 |
if not run_marker:
|
|
@@ -131,6 +133,8 @@ with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb+") as temp_pdf:
|
|
| 131 |
"force_ocr": force_ocr,
|
| 132 |
"debug": debug,
|
| 133 |
"output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
|
|
|
|
|
|
|
| 134 |
}
|
| 135 |
config_parser = ConfigParser(cli_options)
|
| 136 |
rendered = convert_pdf(
|
|
|
|
| 114 |
output_format = st.sidebar.selectbox("Output format", ["markdown", "json", "html"], index=0)
|
| 115 |
run_marker = st.sidebar.button("Run Marker")
|
| 116 |
|
| 117 |
+
use_llm = st.sidebar.checkbox("Use LLM", help="Use LLM for higher quality processing", value=False)
|
| 118 |
force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
|
| 119 |
+
strip_existing_ocr = st.sidebar.checkbox("Strip existing OCR", help="Strip existing OCR text from the PDF and re-OCR.", value=False)
|
| 120 |
debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
|
| 121 |
|
| 122 |
if not run_marker:
|
|
|
|
| 133 |
"force_ocr": force_ocr,
|
| 134 |
"debug": debug,
|
| 135 |
"output_dir": settings.DEBUG_DATA_FOLDER if debug else None,
|
| 136 |
+
"use_llm": use_llm,
|
| 137 |
+
"strip_existing_ocr": strip_existing_ocr
|
| 138 |
}
|
| 139 |
config_parser = ConfigParser(cli_options)
|
| 140 |
rendered = convert_pdf(
|
poetry.lock
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
# This file is automatically @generated by Poetry 1.8.
|
| 2 |
|
| 3 |
[[package]]
|
| 4 |
name = "aiohappyeyeballs"
|
|
@@ -373,6 +373,7 @@ files = [
|
|
| 373 |
]
|
| 374 |
|
| 375 |
[package.dependencies]
|
|
|
|
| 376 |
webencodings = "*"
|
| 377 |
|
| 378 |
[package.extras]
|
|
@@ -1008,13 +1009,13 @@ wcwidth = "*"
|
|
| 1008 |
|
| 1009 |
[[package]]
|
| 1010 |
name = "gitdb"
|
| 1011 |
-
version = "4.0.
|
| 1012 |
description = "Git Object Database"
|
| 1013 |
optional = false
|
| 1014 |
python-versions = ">=3.7"
|
| 1015 |
files = [
|
| 1016 |
-
{file = "gitdb-4.0.
|
| 1017 |
-
{file = "gitdb-4.0.
|
| 1018 |
]
|
| 1019 |
|
| 1020 |
[package.dependencies]
|
|
@@ -1022,20 +1023,20 @@ smmap = ">=3.0.1,<6"
|
|
| 1022 |
|
| 1023 |
[[package]]
|
| 1024 |
name = "gitpython"
|
| 1025 |
-
version = "3.1.
|
| 1026 |
description = "GitPython is a Python library used to interact with Git repositories"
|
| 1027 |
optional = false
|
| 1028 |
python-versions = ">=3.7"
|
| 1029 |
files = [
|
| 1030 |
-
{file = "GitPython-3.1.
|
| 1031 |
-
{file = "
|
| 1032 |
]
|
| 1033 |
|
| 1034 |
[package.dependencies]
|
| 1035 |
gitdb = ">=4.0.1,<5"
|
| 1036 |
|
| 1037 |
[package.extras]
|
| 1038 |
-
doc = ["sphinx (
|
| 1039 |
test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions"]
|
| 1040 |
|
| 1041 |
[[package]]
|
|
@@ -2237,18 +2238,18 @@ test = ["flaky", "ipykernel (>=6.19.3)", "ipython", "ipywidgets", "nbconvert (>=
|
|
| 2237 |
|
| 2238 |
[[package]]
|
| 2239 |
name = "nbconvert"
|
| 2240 |
-
version = "7.16.
|
| 2241 |
description = "Converting Jupyter Notebooks (.ipynb files) to other formats. Output formats include asciidoc, html, latex, markdown, pdf, py, rst, script. nbconvert can be used both as a Python library (`import nbconvert`) or as a command line tool (invoked as `jupyter nbconvert ...`)."
|
| 2242 |
optional = false
|
| 2243 |
python-versions = ">=3.8"
|
| 2244 |
files = [
|
| 2245 |
-
{file = "nbconvert-7.16.
|
| 2246 |
-
{file = "nbconvert-7.16.
|
| 2247 |
]
|
| 2248 |
|
| 2249 |
[package.dependencies]
|
| 2250 |
beautifulsoup4 = "*"
|
| 2251 |
-
bleach = "!=5.0.0"
|
| 2252 |
defusedxml = "*"
|
| 2253 |
jinja2 = ">=3.0"
|
| 2254 |
jupyter-core = ">=4.7"
|
|
@@ -2260,7 +2261,6 @@ nbformat = ">=5.7"
|
|
| 2260 |
packaging = "*"
|
| 2261 |
pandocfilters = ">=1.4.1"
|
| 2262 |
pygments = ">=2.4.1"
|
| 2263 |
-
tinycss2 = "*"
|
| 2264 |
traitlets = ">=5.1"
|
| 2265 |
|
| 2266 |
[package.extras]
|
|
@@ -3066,8 +3066,6 @@ files = [
|
|
| 3066 |
{file = "psutil-6.1.1-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:8df0178ba8a9e5bc84fed9cfa61d54601b371fbec5c8eebad27575f1e105c0d4"},
|
| 3067 |
{file = "psutil-6.1.1-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:1924e659d6c19c647e763e78670a05dbb7feaf44a0e9c94bf9e14dfc6ba50468"},
|
| 3068 |
{file = "psutil-6.1.1-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:018aeae2af92d943fdf1da6b58665124897cfc94faa2ca92098838f83e1b1bca"},
|
| 3069 |
-
{file = "psutil-6.1.1-cp27-none-win32.whl", hash = "sha256:6d4281f5bbca041e2292be3380ec56a9413b790579b8e593b1784499d0005dac"},
|
| 3070 |
-
{file = "psutil-6.1.1-cp27-none-win_amd64.whl", hash = "sha256:c777eb75bb33c47377c9af68f30e9f11bc78e0f07fbf907be4a5d70b2fe5f030"},
|
| 3071 |
{file = "psutil-6.1.1-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:fc0ed7fe2231a444fc219b9c42d0376e0a9a1a72f16c5cfa0f68d19f1a0663e8"},
|
| 3072 |
{file = "psutil-6.1.1-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:0bdd4eab935276290ad3cb718e9809412895ca6b5b334f5a9111ee6d9aff9377"},
|
| 3073 |
{file = "psutil-6.1.1-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b6e06c20c05fe95a3d7302d74e7097756d4ba1247975ad6905441ae1b5b66003"},
|
|
@@ -3333,13 +3331,13 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
|
|
| 3333 |
|
| 3334 |
[[package]]
|
| 3335 |
name = "pydantic-settings"
|
| 3336 |
-
version = "2.7.
|
| 3337 |
description = "Settings management using Pydantic"
|
| 3338 |
optional = false
|
| 3339 |
python-versions = ">=3.8"
|
| 3340 |
files = [
|
| 3341 |
-
{file = "pydantic_settings-2.7.
|
| 3342 |
-
{file = "pydantic_settings-2.7.
|
| 3343 |
]
|
| 3344 |
|
| 3345 |
[package.dependencies]
|
|
@@ -3386,13 +3384,13 @@ windows-terminal = ["colorama (>=0.4.6)"]
|
|
| 3386 |
|
| 3387 |
[[package]]
|
| 3388 |
name = "pyparsing"
|
| 3389 |
-
version = "3.2.
|
| 3390 |
description = "pyparsing module - Classes and methods to define and execute parsing grammars"
|
| 3391 |
optional = false
|
| 3392 |
python-versions = ">=3.9"
|
| 3393 |
files = [
|
| 3394 |
-
{file = "pyparsing-3.2.
|
| 3395 |
-
{file = "pyparsing-3.2.
|
| 3396 |
]
|
| 3397 |
|
| 3398 |
[package.extras]
|
|
@@ -4159,121 +4157,26 @@ pyasn1 = ">=0.1.3"
|
|
| 4159 |
|
| 4160 |
[[package]]
|
| 4161 |
name = "safetensors"
|
| 4162 |
-
version = "0.
|
| 4163 |
description = ""
|
| 4164 |
optional = false
|
| 4165 |
python-versions = ">=3.7"
|
| 4166 |
files = [
|
| 4167 |
-
{file = "safetensors-0.
|
| 4168 |
-
{file = "safetensors-0.
|
| 4169 |
-
{file = "safetensors-0.
|
| 4170 |
-
{file = "safetensors-0.
|
| 4171 |
-
{file = "safetensors-0.
|
| 4172 |
-
{file = "safetensors-0.
|
| 4173 |
-
{file = "safetensors-0.
|
| 4174 |
-
{file = "safetensors-0.
|
| 4175 |
-
{file = "safetensors-0.
|
| 4176 |
-
{file = "safetensors-0.
|
| 4177 |
-
{file = "safetensors-0.
|
| 4178 |
-
{file = "safetensors-0.
|
| 4179 |
-
{file = "safetensors-0.
|
| 4180 |
-
{file = "safetensors-0.
|
| 4181 |
-
{file = "safetensors-0.
|
| 4182 |
-
{file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:59b77e4b7a708988d84f26de3ebead61ef1659c73dcbc9946c18f3b1786d2688"},
|
| 4183 |
-
{file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5d3bc83e14d67adc2e9387e511097f254bd1b43c3020440e708858c684cbac68"},
|
| 4184 |
-
{file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:39371fc551c1072976073ab258c3119395294cf49cdc1f8476794627de3130df"},
|
| 4185 |
-
{file = "safetensors-0.4.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6c19feda32b931cae0acd42748a670bdf56bee6476a046af20181ad3fee4090"},
|
| 4186 |
-
{file = "safetensors-0.4.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a659467495de201e2f282063808a41170448c78bada1e62707b07a27b05e6943"},
|
| 4187 |
-
{file = "safetensors-0.4.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bad5e4b2476949bcd638a89f71b6916fa9a5cae5c1ae7eede337aca2100435c0"},
|
| 4188 |
-
{file = "safetensors-0.4.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:a3a315a6d0054bc6889a17f5668a73f94f7fe55121ff59e0a199e3519c08565f"},
|
| 4189 |
-
{file = "safetensors-0.4.5-cp311-none-win32.whl", hash = "sha256:a01e232e6d3d5cf8b1667bc3b657a77bdab73f0743c26c1d3c5dd7ce86bd3a92"},
|
| 4190 |
-
{file = "safetensors-0.4.5-cp311-none-win_amd64.whl", hash = "sha256:cbd39cae1ad3e3ef6f63a6f07296b080c951f24cec60188378e43d3713000c04"},
|
| 4191 |
-
{file = "safetensors-0.4.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:473300314e026bd1043cef391bb16a8689453363381561b8a3e443870937cc1e"},
|
| 4192 |
-
{file = "safetensors-0.4.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:801183a0f76dc647f51a2d9141ad341f9665602a7899a693207a82fb102cc53e"},
|
| 4193 |
-
{file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1524b54246e422ad6fb6aea1ac71edeeb77666efa67230e1faf6999df9b2e27f"},
|
| 4194 |
-
{file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b3139098e3e8b2ad7afbca96d30ad29157b50c90861084e69fcb80dec7430461"},
|
| 4195 |
-
{file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65573dc35be9059770808e276b017256fa30058802c29e1038eb1c00028502ea"},
|
| 4196 |
-
{file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fd33da8e9407559f8779c82a0448e2133737f922d71f884da27184549416bfed"},
|
| 4197 |
-
{file = "safetensors-0.4.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3685ce7ed036f916316b567152482b7e959dc754fcc4a8342333d222e05f407c"},
|
| 4198 |
-
{file = "safetensors-0.4.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dde2bf390d25f67908278d6f5d59e46211ef98e44108727084d4637ee70ab4f1"},
|
| 4199 |
-
{file = "safetensors-0.4.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7469d70d3de970b1698d47c11ebbf296a308702cbaae7fcb993944751cf985f4"},
|
| 4200 |
-
{file = "safetensors-0.4.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3a6ba28118636a130ccbb968bc33d4684c48678695dba2590169d5ab03a45646"},
|
| 4201 |
-
{file = "safetensors-0.4.5-cp312-none-win32.whl", hash = "sha256:c859c7ed90b0047f58ee27751c8e56951452ed36a67afee1b0a87847d065eec6"},
|
| 4202 |
-
{file = "safetensors-0.4.5-cp312-none-win_amd64.whl", hash = "sha256:b5a8810ad6a6f933fff6c276eae92c1da217b39b4d8b1bc1c0b8af2d270dc532"},
|
| 4203 |
-
{file = "safetensors-0.4.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:25e5f8e2e92a74f05b4ca55686234c32aac19927903792b30ee6d7bd5653d54e"},
|
| 4204 |
-
{file = "safetensors-0.4.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:81efb124b58af39fcd684254c645e35692fea81c51627259cdf6d67ff4458916"},
|
| 4205 |
-
{file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:585f1703a518b437f5103aa9cf70e9bd437cb78eea9c51024329e4fb8a3e3679"},
|
| 4206 |
-
{file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4b99fbf72e3faf0b2f5f16e5e3458b93b7d0a83984fe8d5364c60aa169f2da89"},
|
| 4207 |
-
{file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b17b299ca9966ca983ecda1c0791a3f07f9ca6ab5ded8ef3d283fff45f6bcd5f"},
|
| 4208 |
-
{file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:76ded72f69209c9780fdb23ea89e56d35c54ae6abcdec67ccb22af8e696e449a"},
|
| 4209 |
-
{file = "safetensors-0.4.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2783956926303dcfeb1de91a4d1204cd4089ab441e622e7caee0642281109db3"},
|
| 4210 |
-
{file = "safetensors-0.4.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d94581aab8c6b204def4d7320f07534d6ee34cd4855688004a4354e63b639a35"},
|
| 4211 |
-
{file = "safetensors-0.4.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:67e1e7cb8678bb1b37ac48ec0df04faf689e2f4e9e81e566b5c63d9f23748523"},
|
| 4212 |
-
{file = "safetensors-0.4.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:dbd280b07e6054ea68b0cb4b16ad9703e7d63cd6890f577cb98acc5354780142"},
|
| 4213 |
-
{file = "safetensors-0.4.5-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:77d9b228da8374c7262046a36c1f656ba32a93df6cc51cd4453af932011e77f1"},
|
| 4214 |
-
{file = "safetensors-0.4.5-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:500cac01d50b301ab7bb192353317035011c5ceeef0fca652f9f43c000bb7f8d"},
|
| 4215 |
-
{file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75331c0c746f03158ded32465b7d0b0e24c5a22121743662a2393439c43a45cf"},
|
| 4216 |
-
{file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:670e95fe34e0d591d0529e5e59fd9d3d72bc77b1444fcaa14dccda4f36b5a38b"},
|
| 4217 |
-
{file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:098923e2574ff237c517d6e840acada8e5b311cb1fa226019105ed82e9c3b62f"},
|
| 4218 |
-
{file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:13ca0902d2648775089fa6a0c8fc9e6390c5f8ee576517d33f9261656f851e3f"},
|
| 4219 |
-
{file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f0032bedc869c56f8d26259fe39cd21c5199cd57f2228d817a0e23e8370af25"},
|
| 4220 |
-
{file = "safetensors-0.4.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f4b15f51b4f8f2a512341d9ce3475cacc19c5fdfc5db1f0e19449e75f95c7dc8"},
|
| 4221 |
-
{file = "safetensors-0.4.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:f6594d130d0ad933d885c6a7b75c5183cb0e8450f799b80a39eae2b8508955eb"},
|
| 4222 |
-
{file = "safetensors-0.4.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:60c828a27e852ded2c85fc0f87bf1ec20e464c5cd4d56ff0e0711855cc2e17f8"},
|
| 4223 |
-
{file = "safetensors-0.4.5-cp37-none-win32.whl", hash = "sha256:6d3de65718b86c3eeaa8b73a9c3d123f9307a96bbd7be9698e21e76a56443af5"},
|
| 4224 |
-
{file = "safetensors-0.4.5-cp37-none-win_amd64.whl", hash = "sha256:5a2d68a523a4cefd791156a4174189a4114cf0bf9c50ceb89f261600f3b2b81a"},
|
| 4225 |
-
{file = "safetensors-0.4.5-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:e7a97058f96340850da0601a3309f3d29d6191b0702b2da201e54c6e3e44ccf0"},
|
| 4226 |
-
{file = "safetensors-0.4.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:63bfd425e25f5c733f572e2246e08a1c38bd6f2e027d3f7c87e2e43f228d1345"},
|
| 4227 |
-
{file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3664ac565d0e809b0b929dae7ccd74e4d3273cd0c6d1220c6430035befb678e"},
|
| 4228 |
-
{file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:313514b0b9b73ff4ddfb4edd71860696dbe3c1c9dc4d5cc13dbd74da283d2cbf"},
|
| 4229 |
-
{file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31fa33ee326f750a2f2134a6174773c281d9a266ccd000bd4686d8021f1f3dac"},
|
| 4230 |
-
{file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:09566792588d77b68abe53754c9f1308fadd35c9f87be939e22c623eaacbed6b"},
|
| 4231 |
-
{file = "safetensors-0.4.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309aaec9b66cbf07ad3a2e5cb8a03205663324fea024ba391594423d0f00d9fe"},
|
| 4232 |
-
{file = "safetensors-0.4.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:53946c5813b8f9e26103c5efff4a931cc45d874f45229edd68557ffb35ffb9f8"},
|
| 4233 |
-
{file = "safetensors-0.4.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:868f9df9e99ad1e7f38c52194063a982bc88fedc7d05096f4f8160403aaf4bd6"},
|
| 4234 |
-
{file = "safetensors-0.4.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:9cc9449bd0b0bc538bd5e268221f0c5590bc5c14c1934a6ae359d44410dc68c4"},
|
| 4235 |
-
{file = "safetensors-0.4.5-cp38-none-win32.whl", hash = "sha256:83c4f13a9e687335c3928f615cd63a37e3f8ef072a3f2a0599fa09f863fb06a2"},
|
| 4236 |
-
{file = "safetensors-0.4.5-cp38-none-win_amd64.whl", hash = "sha256:b98d40a2ffa560653f6274e15b27b3544e8e3713a44627ce268f419f35c49478"},
|
| 4237 |
-
{file = "safetensors-0.4.5-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:cf727bb1281d66699bef5683b04d98c894a2803442c490a8d45cd365abfbdeb2"},
|
| 4238 |
-
{file = "safetensors-0.4.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:96f1d038c827cdc552d97e71f522e1049fef0542be575421f7684756a748e457"},
|
| 4239 |
-
{file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:139fbee92570ecea774e6344fee908907db79646d00b12c535f66bc78bd5ea2c"},
|
| 4240 |
-
{file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c36302c1c69eebb383775a89645a32b9d266878fab619819ce660309d6176c9b"},
|
| 4241 |
-
{file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d641f5b8149ea98deb5ffcf604d764aad1de38a8285f86771ce1abf8e74c4891"},
|
| 4242 |
-
{file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b4db6a61d968de73722b858038c616a1bebd4a86abe2688e46ca0cc2d17558f2"},
|
| 4243 |
-
{file = "safetensors-0.4.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b75a616e02f21b6f1d5785b20cecbab5e2bd3f6358a90e8925b813d557666ec1"},
|
| 4244 |
-
{file = "safetensors-0.4.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:788ee7d04cc0e0e7f944c52ff05f52a4415b312f5efd2ee66389fb7685ee030c"},
|
| 4245 |
-
{file = "safetensors-0.4.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:87bc42bd04fd9ca31396d3ca0433db0be1411b6b53ac5a32b7845a85d01ffc2e"},
|
| 4246 |
-
{file = "safetensors-0.4.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4037676c86365a721a8c9510323a51861d703b399b78a6b4486a54a65a975fca"},
|
| 4247 |
-
{file = "safetensors-0.4.5-cp39-none-win32.whl", hash = "sha256:1500418454529d0ed5c1564bda376c4ddff43f30fce9517d9bee7bcce5a8ef50"},
|
| 4248 |
-
{file = "safetensors-0.4.5-cp39-none-win_amd64.whl", hash = "sha256:9d1a94b9d793ed8fe35ab6d5cea28d540a46559bafc6aae98f30ee0867000cab"},
|
| 4249 |
-
{file = "safetensors-0.4.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:fdadf66b5a22ceb645d5435a0be7a0292ce59648ca1d46b352f13cff3ea80410"},
|
| 4250 |
-
{file = "safetensors-0.4.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d42ffd4c2259f31832cb17ff866c111684c87bd930892a1ba53fed28370c918c"},
|
| 4251 |
-
{file = "safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd8a1f6d2063a92cd04145c7fd9e31a1c7d85fbec20113a14b487563fdbc0597"},
|
| 4252 |
-
{file = "safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:951d2fcf1817f4fb0ef0b48f6696688a4e852a95922a042b3f96aaa67eedc920"},
|
| 4253 |
-
{file = "safetensors-0.4.5-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6ac85d9a8c1af0e3132371d9f2d134695a06a96993c2e2f0bbe25debb9e3f67a"},
|
| 4254 |
-
{file = "safetensors-0.4.5-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e3cec4a29eb7fe8da0b1c7988bc3828183080439dd559f720414450de076fcab"},
|
| 4255 |
-
{file = "safetensors-0.4.5-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:21742b391b859e67b26c0b2ac37f52c9c0944a879a25ad2f9f9f3cd61e7fda8f"},
|
| 4256 |
-
{file = "safetensors-0.4.5-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c7db3006a4915151ce1913652e907cdede299b974641a83fbc092102ac41b644"},
|
| 4257 |
-
{file = "safetensors-0.4.5-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f68bf99ea970960a237f416ea394e266e0361895753df06e3e06e6ea7907d98b"},
|
| 4258 |
-
{file = "safetensors-0.4.5-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8158938cf3324172df024da511839d373c40fbfaa83e9abf467174b2910d7b4c"},
|
| 4259 |
-
{file = "safetensors-0.4.5-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:540ce6c4bf6b58cb0fd93fa5f143bc0ee341c93bb4f9287ccd92cf898cc1b0dd"},
|
| 4260 |
-
{file = "safetensors-0.4.5-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:bfeaa1a699c6b9ed514bd15e6a91e74738b71125a9292159e3d6b7f0a53d2cde"},
|
| 4261 |
-
{file = "safetensors-0.4.5-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:01c8f00da537af711979e1b42a69a8ec9e1d7112f208e0e9b8a35d2c381085ef"},
|
| 4262 |
-
{file = "safetensors-0.4.5-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a0dd565f83b30f2ca79b5d35748d0d99dd4b3454f80e03dfb41f0038e3bdf180"},
|
| 4263 |
-
{file = "safetensors-0.4.5-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:023b6e5facda76989f4cba95a861b7e656b87e225f61811065d5c501f78cdb3f"},
|
| 4264 |
-
{file = "safetensors-0.4.5-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9633b663393d5796f0b60249549371e392b75a0b955c07e9c6f8708a87fc841f"},
|
| 4265 |
-
{file = "safetensors-0.4.5-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78dd8adfb48716233c45f676d6e48534d34b4bceb50162c13d1f0bdf6f78590a"},
|
| 4266 |
-
{file = "safetensors-0.4.5-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8e8deb16c4321d61ae72533b8451ec4a9af8656d1c61ff81aa49f966406e4b68"},
|
| 4267 |
-
{file = "safetensors-0.4.5-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:52452fa5999dc50c4decaf0c53aa28371f7f1e0fe5c2dd9129059fbe1e1599c7"},
|
| 4268 |
-
{file = "safetensors-0.4.5-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:d5f23198821e227cfc52d50fa989813513db381255c6d100927b012f0cfec63d"},
|
| 4269 |
-
{file = "safetensors-0.4.5-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f4beb84b6073b1247a773141a6331117e35d07134b3bb0383003f39971d414bb"},
|
| 4270 |
-
{file = "safetensors-0.4.5-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:68814d599d25ed2fdd045ed54d370d1d03cf35e02dce56de44c651f828fb9b7b"},
|
| 4271 |
-
{file = "safetensors-0.4.5-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0b6453c54c57c1781292c46593f8a37254b8b99004c68d6c3ce229688931a22"},
|
| 4272 |
-
{file = "safetensors-0.4.5-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:adaa9c6dead67e2dd90d634f89131e43162012479d86e25618e821a03d1eb1dc"},
|
| 4273 |
-
{file = "safetensors-0.4.5-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:73e7d408e9012cd17511b382b43547850969c7979efc2bc353f317abaf23c84c"},
|
| 4274 |
-
{file = "safetensors-0.4.5-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:775409ce0fcc58b10773fdb4221ed1eb007de10fe7adbdf8f5e8a56096b6f0bc"},
|
| 4275 |
-
{file = "safetensors-0.4.5-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:834001bed193e4440c4a3950a31059523ee5090605c907c66808664c932b549c"},
|
| 4276 |
-
{file = "safetensors-0.4.5.tar.gz", hash = "sha256:d73de19682deabb02524b3d5d1f8b3aaba94c72f1bbfc7911b9b9d5d391c0310"},
|
| 4277 |
]
|
| 4278 |
|
| 4279 |
[package.extras]
|
|
@@ -4283,7 +4186,7 @@ jax = ["flax (>=0.6.3)", "jax (>=0.3.25)", "jaxlib (>=0.3.25)", "safetensors[num
|
|
| 4283 |
mlx = ["mlx (>=0.0.9)"]
|
| 4284 |
numpy = ["numpy (>=1.21.6)"]
|
| 4285 |
paddlepaddle = ["paddlepaddle (>=2.4.1)", "safetensors[numpy]"]
|
| 4286 |
-
pinned-tf = ["safetensors[numpy]", "tensorflow (==2.
|
| 4287 |
quality = ["black (==22.3)", "click (==8.0.4)", "flake8 (>=3.8.3)", "isort (>=5.5.4)"]
|
| 4288 |
tensorflow = ["safetensors[numpy]", "tensorflow (>=2.11.0)"]
|
| 4289 |
testing = ["h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "hypothesis (>=6.70.2)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "safetensors[numpy]", "setuptools-rust (>=1.5.2)"]
|
|
@@ -4442,13 +4345,13 @@ files = [
|
|
| 4442 |
|
| 4443 |
[[package]]
|
| 4444 |
name = "smmap"
|
| 4445 |
-
version = "5.0.
|
| 4446 |
description = "A pure Python implementation of a sliding window memory map manager"
|
| 4447 |
optional = false
|
| 4448 |
python-versions = ">=3.7"
|
| 4449 |
files = [
|
| 4450 |
-
{file = "smmap-5.0.
|
| 4451 |
-
{file = "smmap-5.0.
|
| 4452 |
]
|
| 4453 |
|
| 4454 |
[[package]]
|
|
@@ -4546,24 +4449,24 @@ snowflake = ["snowflake-connector-python (>=2.8.0)", "snowflake-snowpark-python[
|
|
| 4546 |
|
| 4547 |
[[package]]
|
| 4548 |
name = "surya-ocr"
|
| 4549 |
-
version = "0.8.
|
| 4550 |
description = "OCR, layout, reading order, and table recognition in 90+ languages"
|
| 4551 |
optional = false
|
| 4552 |
python-versions = "<4.0,>=3.10"
|
| 4553 |
files = [
|
| 4554 |
-
{file = "surya_ocr-0.8.
|
| 4555 |
-
{file = "surya_ocr-0.8.
|
| 4556 |
]
|
| 4557 |
|
| 4558 |
[package.dependencies]
|
| 4559 |
filetype = ">=1.2.0,<2.0.0"
|
| 4560 |
ftfy = ">=6.1.3,<7.0.0"
|
| 4561 |
opencv-python = ">=4.9.0.80,<5.0.0.0"
|
| 4562 |
-
pdftext = ">=0.4.
|
| 4563 |
pillow = ">=10.2.0,<11.0.0"
|
| 4564 |
pydantic = ">=2.5.3,<3.0.0"
|
| 4565 |
pydantic-settings = ">=2.1.0,<3.0.0"
|
| 4566 |
-
pypdfium2 = "
|
| 4567 |
python-dotenv = ">=1.0.0,<2.0.0"
|
| 4568 |
tabulate = ">=0.9.0,<0.10.0"
|
| 4569 |
torch = ">=2.4.1,<3.0.0"
|
|
@@ -5417,4 +5320,4 @@ propcache = ">=0.2.0"
|
|
| 5417 |
[metadata]
|
| 5418 |
lock-version = "2.0"
|
| 5419 |
python-versions = "^3.10"
|
| 5420 |
-
content-hash = "
|
|
|
|
| 1 |
+
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
|
| 2 |
|
| 3 |
[[package]]
|
| 4 |
name = "aiohappyeyeballs"
|
|
|
|
| 373 |
]
|
| 374 |
|
| 375 |
[package.dependencies]
|
| 376 |
+
tinycss2 = {version = ">=1.1.0,<1.5", optional = true, markers = "extra == \"css\""}
|
| 377 |
webencodings = "*"
|
| 378 |
|
| 379 |
[package.extras]
|
|
|
|
| 1009 |
|
| 1010 |
[[package]]
|
| 1011 |
name = "gitdb"
|
| 1012 |
+
version = "4.0.12"
|
| 1013 |
description = "Git Object Database"
|
| 1014 |
optional = false
|
| 1015 |
python-versions = ">=3.7"
|
| 1016 |
files = [
|
| 1017 |
+
{file = "gitdb-4.0.12-py3-none-any.whl", hash = "sha256:67073e15955400952c6565cc3e707c554a4eea2e428946f7a4c162fab9bd9bcf"},
|
| 1018 |
+
{file = "gitdb-4.0.12.tar.gz", hash = "sha256:5ef71f855d191a3326fcfbc0d5da835f26b13fbcba60c32c21091c349ffdb571"},
|
| 1019 |
]
|
| 1020 |
|
| 1021 |
[package.dependencies]
|
|
|
|
| 1023 |
|
| 1024 |
[[package]]
|
| 1025 |
name = "gitpython"
|
| 1026 |
+
version = "3.1.44"
|
| 1027 |
description = "GitPython is a Python library used to interact with Git repositories"
|
| 1028 |
optional = false
|
| 1029 |
python-versions = ">=3.7"
|
| 1030 |
files = [
|
| 1031 |
+
{file = "GitPython-3.1.44-py3-none-any.whl", hash = "sha256:9e0e10cda9bed1ee64bc9a6de50e7e38a9c9943241cd7f585f6df3ed28011110"},
|
| 1032 |
+
{file = "gitpython-3.1.44.tar.gz", hash = "sha256:c87e30b26253bf5418b01b0660f818967f3c503193838337fe5e573331249269"},
|
| 1033 |
]
|
| 1034 |
|
| 1035 |
[package.dependencies]
|
| 1036 |
gitdb = ">=4.0.1,<5"
|
| 1037 |
|
| 1038 |
[package.extras]
|
| 1039 |
+
doc = ["sphinx (>=7.1.2,<7.2)", "sphinx-autodoc-typehints", "sphinx_rtd_theme"]
|
| 1040 |
test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions"]
|
| 1041 |
|
| 1042 |
[[package]]
|
|
|
|
| 2238 |
|
| 2239 |
[[package]]
|
| 2240 |
name = "nbconvert"
|
| 2241 |
+
version = "7.16.5"
|
| 2242 |
description = "Converting Jupyter Notebooks (.ipynb files) to other formats. Output formats include asciidoc, html, latex, markdown, pdf, py, rst, script. nbconvert can be used both as a Python library (`import nbconvert`) or as a command line tool (invoked as `jupyter nbconvert ...`)."
|
| 2243 |
optional = false
|
| 2244 |
python-versions = ">=3.8"
|
| 2245 |
files = [
|
| 2246 |
+
{file = "nbconvert-7.16.5-py3-none-any.whl", hash = "sha256:e12eac052d6fd03040af4166c563d76e7aeead2e9aadf5356db552a1784bd547"},
|
| 2247 |
+
{file = "nbconvert-7.16.5.tar.gz", hash = "sha256:c83467bb5777fdfaac5ebbb8e864f300b277f68692ecc04d6dab72f2d8442344"},
|
| 2248 |
]
|
| 2249 |
|
| 2250 |
[package.dependencies]
|
| 2251 |
beautifulsoup4 = "*"
|
| 2252 |
+
bleach = {version = "!=5.0.0", extras = ["css"]}
|
| 2253 |
defusedxml = "*"
|
| 2254 |
jinja2 = ">=3.0"
|
| 2255 |
jupyter-core = ">=4.7"
|
|
|
|
| 2261 |
packaging = "*"
|
| 2262 |
pandocfilters = ">=1.4.1"
|
| 2263 |
pygments = ">=2.4.1"
|
|
|
|
| 2264 |
traitlets = ">=5.1"
|
| 2265 |
|
| 2266 |
[package.extras]
|
|
|
|
| 3066 |
{file = "psutil-6.1.1-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:8df0178ba8a9e5bc84fed9cfa61d54601b371fbec5c8eebad27575f1e105c0d4"},
|
| 3067 |
{file = "psutil-6.1.1-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:1924e659d6c19c647e763e78670a05dbb7feaf44a0e9c94bf9e14dfc6ba50468"},
|
| 3068 |
{file = "psutil-6.1.1-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:018aeae2af92d943fdf1da6b58665124897cfc94faa2ca92098838f83e1b1bca"},
|
|
|
|
|
|
|
| 3069 |
{file = "psutil-6.1.1-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:fc0ed7fe2231a444fc219b9c42d0376e0a9a1a72f16c5cfa0f68d19f1a0663e8"},
|
| 3070 |
{file = "psutil-6.1.1-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:0bdd4eab935276290ad3cb718e9809412895ca6b5b334f5a9111ee6d9aff9377"},
|
| 3071 |
{file = "psutil-6.1.1-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b6e06c20c05fe95a3d7302d74e7097756d4ba1247975ad6905441ae1b5b66003"},
|
|
|
|
| 3331 |
|
| 3332 |
[[package]]
|
| 3333 |
name = "pydantic-settings"
|
| 3334 |
+
version = "2.7.1"
|
| 3335 |
description = "Settings management using Pydantic"
|
| 3336 |
optional = false
|
| 3337 |
python-versions = ">=3.8"
|
| 3338 |
files = [
|
| 3339 |
+
{file = "pydantic_settings-2.7.1-py3-none-any.whl", hash = "sha256:590be9e6e24d06db33a4262829edef682500ef008565a969c73d39d5f8bfb3fd"},
|
| 3340 |
+
{file = "pydantic_settings-2.7.1.tar.gz", hash = "sha256:10c9caad35e64bfb3c2fbf70a078c0e25cc92499782e5200747f942a065dec93"},
|
| 3341 |
]
|
| 3342 |
|
| 3343 |
[package.dependencies]
|
|
|
|
| 3384 |
|
| 3385 |
[[package]]
|
| 3386 |
name = "pyparsing"
|
| 3387 |
+
version = "3.2.1"
|
| 3388 |
description = "pyparsing module - Classes and methods to define and execute parsing grammars"
|
| 3389 |
optional = false
|
| 3390 |
python-versions = ">=3.9"
|
| 3391 |
files = [
|
| 3392 |
+
{file = "pyparsing-3.2.1-py3-none-any.whl", hash = "sha256:506ff4f4386c4cec0590ec19e6302d3aedb992fdc02c761e90416f158dacf8e1"},
|
| 3393 |
+
{file = "pyparsing-3.2.1.tar.gz", hash = "sha256:61980854fd66de3a90028d679a954d5f2623e83144b5afe5ee86f43d762e5f0a"},
|
| 3394 |
]
|
| 3395 |
|
| 3396 |
[package.extras]
|
|
|
|
| 4157 |
|
| 4158 |
[[package]]
|
| 4159 |
name = "safetensors"
|
| 4160 |
+
version = "0.5.0"
|
| 4161 |
description = ""
|
| 4162 |
optional = false
|
| 4163 |
python-versions = ">=3.7"
|
| 4164 |
files = [
|
| 4165 |
+
{file = "safetensors-0.5.0-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:c683b9b485bee43422ba2855f72777c37647190281e03da4c8d2a69fa5336558"},
|
| 4166 |
+
{file = "safetensors-0.5.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:6106aa835deb7263f7014f74c05842ab828d6c11d789f2e7e98f26b1a305e72d"},
|
| 4167 |
+
{file = "safetensors-0.5.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1349611f74f55c5ee1c1c144c536a2743c38f7d8bf60b9fc8267e0efc0591a2"},
|
| 4168 |
+
{file = "safetensors-0.5.0-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:56d936028ac799e18644b08a91fd98b4b62ae3dcd0440b1cfcb56535785589f1"},
|
| 4169 |
+
{file = "safetensors-0.5.0-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a2f26afada2233576ffea6b80042c2c0a8105c164254af56168ec14299ad3122"},
|
| 4170 |
+
{file = "safetensors-0.5.0-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:20067e7a5e63f0cbc88457b2a1161e70ff73af4cc3a24bce90309430cd6f6e7e"},
|
| 4171 |
+
{file = "safetensors-0.5.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:649d6a4aa34d5174ae87289068ccc2fec2a1a998ecf83425aa5a42c3eff69bcf"},
|
| 4172 |
+
{file = "safetensors-0.5.0-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:debff88f41d569a3e93a955469f83864e432af35bb34b16f65a9ddf378daa3ae"},
|
| 4173 |
+
{file = "safetensors-0.5.0-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:bdf6a3e366ea8ba1a0538db6099229e95811194432c684ea28ea7ae28763b8dc"},
|
| 4174 |
+
{file = "safetensors-0.5.0-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:0371afd84c200a80eb7103bf715108b0c3846132fb82453ae018609a15551580"},
|
| 4175 |
+
{file = "safetensors-0.5.0-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:5ec7fc8c3d2f32ebf1c7011bc886b362e53ee0a1ec6d828c39d531fed8b325d6"},
|
| 4176 |
+
{file = "safetensors-0.5.0-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:53715e4ea0ef23c08f004baae0f609a7773de7d4148727760417c6760cfd6b76"},
|
| 4177 |
+
{file = "safetensors-0.5.0-cp38-abi3-win32.whl", hash = "sha256:b85565bc2f0456961a788d2f11d9d892eec46603db0e4923aa9512c2355aa727"},
|
| 4178 |
+
{file = "safetensors-0.5.0-cp38-abi3-win_amd64.whl", hash = "sha256:f451941f8aa11e7be5c3fa450e264609a2b1e65fa38ae590a74e55a94d646b76"},
|
| 4179 |
+
{file = "safetensors-0.5.0.tar.gz", hash = "sha256:c47b34c549fa1e0c655c4644da31332c61332c732c47c8dd9399347e9aac69d1"},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4180 |
]
|
| 4181 |
|
| 4182 |
[package.extras]
|
|
|
|
| 4186 |
mlx = ["mlx (>=0.0.9)"]
|
| 4187 |
numpy = ["numpy (>=1.21.6)"]
|
| 4188 |
paddlepaddle = ["paddlepaddle (>=2.4.1)", "safetensors[numpy]"]
|
| 4189 |
+
pinned-tf = ["safetensors[numpy]", "tensorflow (==2.18.0)"]
|
| 4190 |
quality = ["black (==22.3)", "click (==8.0.4)", "flake8 (>=3.8.3)", "isort (>=5.5.4)"]
|
| 4191 |
tensorflow = ["safetensors[numpy]", "tensorflow (>=2.11.0)"]
|
| 4192 |
testing = ["h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "hypothesis (>=6.70.2)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "safetensors[numpy]", "setuptools-rust (>=1.5.2)"]
|
|
|
|
| 4345 |
|
| 4346 |
[[package]]
|
| 4347 |
name = "smmap"
|
| 4348 |
+
version = "5.0.2"
|
| 4349 |
description = "A pure Python implementation of a sliding window memory map manager"
|
| 4350 |
optional = false
|
| 4351 |
python-versions = ">=3.7"
|
| 4352 |
files = [
|
| 4353 |
+
{file = "smmap-5.0.2-py3-none-any.whl", hash = "sha256:b30115f0def7d7531d22a0fb6502488d879e75b260a9db4d0819cfb25403af5e"},
|
| 4354 |
+
{file = "smmap-5.0.2.tar.gz", hash = "sha256:26ea65a03958fa0c8a1c7e8c7a58fdc77221b8910f6be2131affade476898ad5"},
|
| 4355 |
]
|
| 4356 |
|
| 4357 |
[[package]]
|
|
|
|
| 4449 |
|
| 4450 |
[[package]]
|
| 4451 |
name = "surya-ocr"
|
| 4452 |
+
version = "0.8.3"
|
| 4453 |
description = "OCR, layout, reading order, and table recognition in 90+ languages"
|
| 4454 |
optional = false
|
| 4455 |
python-versions = "<4.0,>=3.10"
|
| 4456 |
files = [
|
| 4457 |
+
{file = "surya_ocr-0.8.3-py3-none-any.whl", hash = "sha256:b2a0e07de8741d2f1b68a1b9f33b6864779648619607ee09dcbeabc31ee79289"},
|
| 4458 |
+
{file = "surya_ocr-0.8.3.tar.gz", hash = "sha256:13d9ab7d5d971f16e37bffe48767b80df6999cccd3c7eb7c154f33f440ac02e3"},
|
| 4459 |
]
|
| 4460 |
|
| 4461 |
[package.dependencies]
|
| 4462 |
filetype = ">=1.2.0,<2.0.0"
|
| 4463 |
ftfy = ">=6.1.3,<7.0.0"
|
| 4464 |
opencv-python = ">=4.9.0.80,<5.0.0.0"
|
| 4465 |
+
pdftext = ">=0.4.1,<0.5.0"
|
| 4466 |
pillow = ">=10.2.0,<11.0.0"
|
| 4467 |
pydantic = ">=2.5.3,<3.0.0"
|
| 4468 |
pydantic-settings = ">=2.1.0,<3.0.0"
|
| 4469 |
+
pypdfium2 = "4.30.0"
|
| 4470 |
python-dotenv = ">=1.0.0,<2.0.0"
|
| 4471 |
tabulate = ">=0.9.0,<0.10.0"
|
| 4472 |
torch = ">=2.4.1,<3.0.0"
|
|
|
|
| 5320 |
[metadata]
|
| 5321 |
lock-version = "2.0"
|
| 5322 |
python-versions = "^3.10"
|
| 5323 |
+
content-hash = "f10872bc2f59616bf1093839e7065c82f7d78a88d8db91fa3c05c2ec5f857e7f"
|
pyproject.toml
CHANGED
|
@@ -33,7 +33,7 @@ tabulate = "^0.9.0"
|
|
| 33 |
ftfy = "^6.1.1"
|
| 34 |
texify = "^0.2.1"
|
| 35 |
rapidfuzz = "^3.8.1"
|
| 36 |
-
surya-ocr = "~0.8.
|
| 37 |
regex = "^2024.4.28"
|
| 38 |
pdftext = "~0.4.1"
|
| 39 |
tabled-pdf = "~0.2.0"
|
|
|
|
| 33 |
ftfy = "^6.1.1"
|
| 34 |
texify = "^0.2.1"
|
| 35 |
rapidfuzz = "^3.8.1"
|
| 36 |
+
surya-ocr = "~0.8.3"
|
| 37 |
regex = "^2024.4.28"
|
| 38 |
pdftext = "~0.4.1"
|
| 39 |
tabled-pdf = "~0.2.0"
|
tests/builders/test_strip_existing_ocr.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
+
from marker.builders.document import DocumentBuilder
|
| 4 |
+
from marker.builders.layout import LayoutBuilder
|
| 5 |
+
from marker.schema import BlockTypes
|
| 6 |
+
|
| 7 |
+
@pytest.mark.config({"page_range": [0], "strip_existing_ocr": True})
|
| 8 |
+
@pytest.mark.filename("handwritten.pdf")
|
| 9 |
+
def test_strip_ocr(pdf_provider):
|
| 10 |
+
# Ensure that the OCR text isn't extracted
|
| 11 |
+
assert len(pdf_provider.page_lines) == 0
|
| 12 |
+
|
| 13 |
+
@pytest.mark.config({"page_range": [0]})
|
| 14 |
+
@pytest.mark.filename("handwritten.pdf")
|
| 15 |
+
def test_keep_ocr(pdf_provider):
|
| 16 |
+
assert len(pdf_provider.page_lines) == 1
|