Vik Paruchuri
committed on
Commit
·
457f524
1
Parent(s):
f7db972
Update surya dep
Browse files
- README.md +6 -1
- poetry.lock +9 -10
README.md
CHANGED
|
@@ -219,7 +219,12 @@ rendered = converter("FILEPATH")
|
|
| 219 |
text, _, images = text_from_rendered(rendered)
|
| 220 |
```
|
| 221 |
|
| 222 |
-
This takes all the same configuration as the PdfConverter. You can specify the configuration
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
|
| 224 |
# Output Formats
|
| 225 |
|
|
|
|
| 219 |
text, _, images = text_from_rendered(rendered)
|
| 220 |
```
|
| 221 |
|
| 222 |
+
This accepts all the same configuration options as the PdfConverter. You can pass the option `--force_layout_block=Table` to skip layout detection and instead assume every page is a table.
|
| 223 |
+
|
| 224 |
+
You can also run this via the CLI with:
|
| 225 |
+
```shell
|
| 226 |
+
python convert_single.py FILENAME --use_llm --force_layout_block Table --converter_cls marker.converters.table.TableConverter
|
| 227 |
+
```
|
| 228 |
|
| 229 |
# Output Formats
|
| 230 |
|
poetry.lock
CHANGED
|
@@ -2923,16 +2923,17 @@ testing = ["docopt", "pytest"]
|
|
| 2923 |
|
| 2924 |
[[package]]
|
| 2925 |
name = "pdftext"
|
| 2926 |
-
version = "0.
|
| 2927 |
description = "Extract structured text from pdfs quickly"
|
| 2928 |
optional = false
|
| 2929 |
python-versions = "<4.0,>=3.10"
|
| 2930 |
files = [
|
| 2931 |
-
{file = "pdftext-0.
|
| 2932 |
-
{file = "pdftext-0.
|
| 2933 |
]
|
| 2934 |
|
| 2935 |
[package.dependencies]
|
|
|
|
| 2936 |
pydantic = ">=2.7.1,<3.0.0"
|
| 2937 |
pydantic-settings = ">=2.2.1,<3.0.0"
|
| 2938 |
pypdfium2 = "4.30.0"
|
|
@@ -4638,26 +4639,24 @@ snowflake = ["snowflake-connector-python (>=2.8.0)", "snowflake-snowpark-python[
|
|
| 4638 |
|
| 4639 |
[[package]]
|
| 4640 |
name = "surya-ocr"
|
| 4641 |
-
version = "0.
|
| 4642 |
description = "OCR, layout, reading order, and table recognition in 90+ languages"
|
| 4643 |
optional = false
|
| 4644 |
python-versions = "<4.0,>=3.10"
|
| 4645 |
files = [
|
| 4646 |
-
{file = "surya_ocr-0.
|
| 4647 |
-
{file = "surya_ocr-0.
|
| 4648 |
]
|
| 4649 |
|
| 4650 |
[package.dependencies]
|
|
|
|
| 4651 |
filetype = ">=1.2.0,<2.0.0"
|
| 4652 |
-
ftfy = ">=6.1.3,<7.0.0"
|
| 4653 |
opencv-python = ">=4.9.0.80,<5.0.0.0"
|
| 4654 |
-
pdftext = ">=0.4.1,<0.5.0"
|
| 4655 |
pillow = ">=10.2.0,<11.0.0"
|
| 4656 |
pydantic = ">=2.5.3,<3.0.0"
|
| 4657 |
pydantic-settings = ">=2.1.0,<3.0.0"
|
| 4658 |
pypdfium2 = "4.30.0"
|
| 4659 |
python-dotenv = ">=1.0.0,<2.0.0"
|
| 4660 |
-
tabulate = ">=0.9.0,<0.10.0"
|
| 4661 |
torch = ">=2.4.1,<3.0.0"
|
| 4662 |
transformers = ">=4.41.0,<5.0.0"
|
| 4663 |
|
|
@@ -5488,4 +5487,4 @@ propcache = ">=0.2.0"
|
|
| 5488 |
[metadata]
|
| 5489 |
lock-version = "2.0"
|
| 5490 |
python-versions = "^3.10"
|
| 5491 |
-
content-hash = "
|
|
|
|
| 2923 |
|
| 2924 |
[[package]]
|
| 2925 |
name = "pdftext"
|
| 2926 |
+
version = "0.5.0"
|
| 2927 |
description = "Extract structured text from pdfs quickly"
|
| 2928 |
optional = false
|
| 2929 |
python-versions = "<4.0,>=3.10"
|
| 2930 |
files = [
|
| 2931 |
+
{file = "pdftext-0.5.0-py3-none-any.whl", hash = "sha256:e14179c5039c711dc5c490ecb1bc15c92ab920e5f7715034b7ae5a387b3b2787"},
|
| 2932 |
+
{file = "pdftext-0.5.0.tar.gz", hash = "sha256:f6487d170abc97867d7539774fecdb0a17599965ba88287b3b89731f5cd7d612"},
|
| 2933 |
]
|
| 2934 |
|
| 2935 |
[package.dependencies]
|
| 2936 |
+
click = ">=8.1.8,<9.0.0"
|
| 2937 |
pydantic = ">=2.7.1,<3.0.0"
|
| 2938 |
pydantic-settings = ">=2.2.1,<3.0.0"
|
| 2939 |
pypdfium2 = "4.30.0"
|
|
|
|
| 4639 |
|
| 4640 |
[[package]]
|
| 4641 |
name = "surya-ocr"
|
| 4642 |
+
version = "0.9.0"
|
| 4643 |
description = "OCR, layout, reading order, and table recognition in 90+ languages"
|
| 4644 |
optional = false
|
| 4645 |
python-versions = "<4.0,>=3.10"
|
| 4646 |
files = [
|
| 4647 |
+
{file = "surya_ocr-0.9.0-py3-none-any.whl", hash = "sha256:1180f504ff9aea3a9992b3ae64eb638d72ed69237baa0550ccb0f62766d3f4e6"},
|
| 4648 |
+
{file = "surya_ocr-0.9.0.tar.gz", hash = "sha256:cd70b55b4d320443ff1b974899e8495279881c4fa7406cc7f243d49b6c73b87d"},
|
| 4649 |
]
|
| 4650 |
|
| 4651 |
[package.dependencies]
|
| 4652 |
+
click = ">=8.1.8,<9.0.0"
|
| 4653 |
filetype = ">=1.2.0,<2.0.0"
|
|
|
|
| 4654 |
opencv-python = ">=4.9.0.80,<5.0.0.0"
|
|
|
|
| 4655 |
pillow = ">=10.2.0,<11.0.0"
|
| 4656 |
pydantic = ">=2.5.3,<3.0.0"
|
| 4657 |
pydantic-settings = ">=2.1.0,<3.0.0"
|
| 4658 |
pypdfium2 = "4.30.0"
|
| 4659 |
python-dotenv = ">=1.0.0,<2.0.0"
|
|
|
|
| 4660 |
torch = ">=2.4.1,<3.0.0"
|
| 4661 |
transformers = ">=4.41.0,<5.0.0"
|
| 4662 |
|
|
|
|
| 5487 |
[metadata]
|
| 5488 |
lock-version = "2.0"
|
| 5489 |
python-versions = "^3.10"
|
| 5490 |
+
content-hash = "f8d6cc52210b2d55a576c13c73deafebd70392f1d48d73a905867e082301ae40"
|