Vik Paruchuri committed on
Commit
457f524
·
1 Parent(s): f7db972

Update surya dep

Browse files
Files changed (2) hide show
  1. README.md +6 -1
  2. poetry.lock +9 -10
README.md CHANGED
@@ -219,7 +219,12 @@ rendered = converter("FILEPATH")
219
  text, _, images = text_from_rendered(rendered)
220
  ```
221
 
222
- This takes all the same configuration as the PdfConverter. You can specify the configuration `force_layout_block=Table` to avoid layout detection and instead assume every page is a table.
 
 
 
 
 
223
 
224
  # Output Formats
225
 
 
219
  text, _, images = text_from_rendered(rendered)
220
  ```
221
 
222
+ This accepts the same configuration options as the PdfConverter. You can specify the configuration `--force_layout_block=Table` to skip layout detection and instead treat every page as a table.
223
+
224
+ You can also run this via the CLI with the following command:
225
+ ```shell
226
+ python convert_single.py FILENAME --use_llm --force_layout_block Table --converter_cls marker.converters.table.TableConverter
227
+ ```
228
 
229
  # Output Formats
230
 
poetry.lock CHANGED
@@ -2923,16 +2923,17 @@ testing = ["docopt", "pytest"]
2923
 
2924
  [[package]]
2925
  name = "pdftext"
2926
- version = "0.4.1"
2927
  description = "Extract structured text from pdfs quickly"
2928
  optional = false
2929
  python-versions = "<4.0,>=3.10"
2930
  files = [
2931
- {file = "pdftext-0.4.1-py3-none-any.whl", hash = "sha256:c25514f7a9ded34f68c8d28511fd78d7586a43d0cf5ef7d6bc33c476fa55fd1f"},
2932
- {file = "pdftext-0.4.1.tar.gz", hash = "sha256:ae06f3c0844e7cc631af86b844f4af06b72da2b67d7450441ead258a64e98660"},
2933
  ]
2934
 
2935
  [package.dependencies]
 
2936
  pydantic = ">=2.7.1,<3.0.0"
2937
  pydantic-settings = ">=2.2.1,<3.0.0"
2938
  pypdfium2 = "4.30.0"
@@ -4638,26 +4639,24 @@ snowflake = ["snowflake-connector-python (>=2.8.0)", "snowflake-snowpark-python[
4638
 
4639
  [[package]]
4640
  name = "surya-ocr"
4641
- version = "0.8.3"
4642
  description = "OCR, layout, reading order, and table recognition in 90+ languages"
4643
  optional = false
4644
  python-versions = "<4.0,>=3.10"
4645
  files = [
4646
- {file = "surya_ocr-0.8.3-py3-none-any.whl", hash = "sha256:b2a0e07de8741d2f1b68a1b9f33b6864779648619607ee09dcbeabc31ee79289"},
4647
- {file = "surya_ocr-0.8.3.tar.gz", hash = "sha256:13d9ab7d5d971f16e37bffe48767b80df6999cccd3c7eb7c154f33f440ac02e3"},
4648
  ]
4649
 
4650
  [package.dependencies]
 
4651
  filetype = ">=1.2.0,<2.0.0"
4652
- ftfy = ">=6.1.3,<7.0.0"
4653
  opencv-python = ">=4.9.0.80,<5.0.0.0"
4654
- pdftext = ">=0.4.1,<0.5.0"
4655
  pillow = ">=10.2.0,<11.0.0"
4656
  pydantic = ">=2.5.3,<3.0.0"
4657
  pydantic-settings = ">=2.1.0,<3.0.0"
4658
  pypdfium2 = "4.30.0"
4659
  python-dotenv = ">=1.0.0,<2.0.0"
4660
- tabulate = ">=0.9.0,<0.10.0"
4661
  torch = ">=2.4.1,<3.0.0"
4662
  transformers = ">=4.41.0,<5.0.0"
4663
 
@@ -5488,4 +5487,4 @@ propcache = ">=0.2.0"
5488
  [metadata]
5489
  lock-version = "2.0"
5490
  python-versions = "^3.10"
5491
- content-hash = "58505c3f91b4bc225c36c0901cd4d6162e2892b47850dd8e11c4c12568ab19d3"
 
2923
 
2924
  [[package]]
2925
  name = "pdftext"
2926
+ version = "0.5.0"
2927
  description = "Extract structured text from pdfs quickly"
2928
  optional = false
2929
  python-versions = "<4.0,>=3.10"
2930
  files = [
2931
+ {file = "pdftext-0.5.0-py3-none-any.whl", hash = "sha256:e14179c5039c711dc5c490ecb1bc15c92ab920e5f7715034b7ae5a387b3b2787"},
2932
+ {file = "pdftext-0.5.0.tar.gz", hash = "sha256:f6487d170abc97867d7539774fecdb0a17599965ba88287b3b89731f5cd7d612"},
2933
  ]
2934
 
2935
  [package.dependencies]
2936
+ click = ">=8.1.8,<9.0.0"
2937
  pydantic = ">=2.7.1,<3.0.0"
2938
  pydantic-settings = ">=2.2.1,<3.0.0"
2939
  pypdfium2 = "4.30.0"
 
4639
 
4640
  [[package]]
4641
  name = "surya-ocr"
4642
+ version = "0.9.0"
4643
  description = "OCR, layout, reading order, and table recognition in 90+ languages"
4644
  optional = false
4645
  python-versions = "<4.0,>=3.10"
4646
  files = [
4647
+ {file = "surya_ocr-0.9.0-py3-none-any.whl", hash = "sha256:1180f504ff9aea3a9992b3ae64eb638d72ed69237baa0550ccb0f62766d3f4e6"},
4648
+ {file = "surya_ocr-0.9.0.tar.gz", hash = "sha256:cd70b55b4d320443ff1b974899e8495279881c4fa7406cc7f243d49b6c73b87d"},
4649
  ]
4650
 
4651
  [package.dependencies]
4652
+ click = ">=8.1.8,<9.0.0"
4653
  filetype = ">=1.2.0,<2.0.0"
 
4654
  opencv-python = ">=4.9.0.80,<5.0.0.0"
 
4655
  pillow = ">=10.2.0,<11.0.0"
4656
  pydantic = ">=2.5.3,<3.0.0"
4657
  pydantic-settings = ">=2.1.0,<3.0.0"
4658
  pypdfium2 = "4.30.0"
4659
  python-dotenv = ">=1.0.0,<2.0.0"
 
4660
  torch = ">=2.4.1,<3.0.0"
4661
  transformers = ">=4.41.0,<5.0.0"
4662
 
 
5487
  [metadata]
5488
  lock-version = "2.0"
5489
  python-versions = "^3.10"
5490
+ content-hash = "f8d6cc52210b2d55a576c13c73deafebd70392f1d48d73a905867e082301ae40"