Spaces:

rt4u
/

marker

Sleeping

App Files Files Community

Vik Paruchuri commited on May 27, 2024

Commit

1ec9d8a

1 Parent(s): 37ae777

Softer OCR heuristics, enable float batch multipliers

Browse files

Files changed (9) hide show

.gitignore +2 -0
marker/layout/layout.py +1 -1
marker/layout/order.py +1 -1
marker/ocr/detection.py +1 -1
marker/ocr/heuristics.py +3 -3
marker/ocr/recognition.py +1 -1
poetry.lock +22 -22
pyproject.toml +3 -3
scripts/verify_benchmark_scores.py +1 -1

.gitignore CHANGED Viewed

@@ -6,6 +6,8 @@ test_data
 training
 wandb
 *.dat
 # Byte-compiled / optimized / DLL files
 __pycache__/

 training
 wandb
 *.dat
+report.json
+benchmark_data
 # Byte-compiled / optimized / DLL files
 __pycache__/

marker/layout/layout.py CHANGED Viewed

@@ -21,7 +21,7 @@ def surya_layout(doc, pages: List[Page], layout_model, batch_multiplier=1):
     text_detection_results = [p.text_lines for p in pages]
     processor = layout_model.processor
-    layout_results = batch_layout_detection(images, layout_model, processor, detection_results=text_detection_results, batch_size=get_batch_size() * batch_multiplier)
     for page, layout_result in zip(pages, layout_results):
         page.layout = layout_result

     text_detection_results = [p.text_lines for p in pages]
     processor = layout_model.processor
+    layout_results = batch_layout_detection(images, layout_model, processor, detection_results=text_detection_results, batch_size=int(get_batch_size() * batch_multiplier))
     for page, layout_result in zip(pages, layout_results):
         page.layout = layout_result

marker/layout/order.py CHANGED Viewed

@@ -30,7 +30,7 @@ def surya_order(doc, pages: List[Page], order_model, batch_multiplier=1):
         bboxes.append(bbox)
     processor = order_model.processor
-    order_results = batch_ordering(images, bboxes, order_model, processor, batch_size=get_batch_size() * batch_multiplier)
     for page, order_result in zip(pages, order_results):
         page.order = order_result

         bboxes.append(bbox)
     processor = order_model.processor
+    order_results = batch_ordering(images, bboxes, order_model, processor, batch_size=int(get_batch_size() * batch_multiplier))
     for page, order_result in zip(pages, order_results):
         page.order = order_result

marker/ocr/detection.py CHANGED Viewed

@@ -21,7 +21,7 @@ def surya_detection(doc: PdfDocument, pages: List[Page], det_model, batch_multip
     max_len = min(len(pages), len(doc))
     images = [render_image(doc[pnum], dpi=settings.SURYA_DETECTOR_DPI) for pnum in range(max_len)]
-    predictions = batch_text_detection(images, det_model, processor, batch_size=get_batch_size() * batch_multiplier)
     for (page, pred) in zip(pages, predictions):
         page.text_lines = pred

     max_len = min(len(pages), len(doc))
     images = [render_image(doc[pnum], dpi=settings.SURYA_DETECTOR_DPI) for pnum in range(max_len)]
+    predictions = batch_text_detection(images, det_model, processor, batch_size=int(get_batch_size() * batch_multiplier))
     for (page, pred) in zip(pages, predictions):
         page.text_lines = pred

marker/ocr/heuristics.py CHANGED Viewed

@@ -12,7 +12,7 @@ def should_ocr_page(page: Page, no_text: bool):
     # OCR page if we got minimal text, or if we got too many spaces
     conditions = [
-        no_text , # Full doc has no text, and needs full OCR
         (len(page.prelim_text) > 0 and detect_bad_ocr(page.prelim_text)),  # Bad OCR
         detected_lines_found is False, # didn't extract text for all detected lines
     ]
@@ -39,7 +39,7 @@ def detect_bad_ocr(text, space_threshold=.7, newline_threshold=.6, alphanum_thre
         return True
     invalid_chars = len([c for c in text if c in settings.INVALID_CHARS])
-    if invalid_chars > max(4.0, len(text) * .03):
         return True
     return False
@@ -52,7 +52,7 @@ def no_text_found(pages: List[Page]):
     return len(full_text.strip()) == 0
-def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.65):
     found_lines = 0
     for detected_line in page.text_lines.bboxes:

     # OCR page if we got minimal text, or if we got too many spaces
     conditions = [
+        no_text, # Full doc has no text, and needs full OCR
         (len(page.prelim_text) > 0 and detect_bad_ocr(page.prelim_text)),  # Bad OCR
         detected_lines_found is False, # didn't extract text for all detected lines
     ]
         return True
     invalid_chars = len([c for c in text if c in settings.INVALID_CHARS])
+    if invalid_chars > max(6.0, len(text) * .03):
         return True
     return False
     return len(full_text.strip()) == 0
+def detected_line_coverage(page: Page, intersect_thresh=.5, detection_thresh=.4):
     found_lines = 0
     for detected_line in page.text_lines.bboxes:

marker/ocr/recognition.py CHANGED Viewed

@@ -83,7 +83,7 @@ def surya_recognition(doc, page_idxs, langs: List[str], rec_model, pages: List[P
     detection_results = [p.text_lines.bboxes for p in selected_pages]
     polygons = [[b.polygon for b in bboxes] for bboxes in detection_results]
-    results = run_recognition(images, surya_langs, rec_model, processor, polygons=polygons, batch_size=get_batch_size() * batch_multiplier)
     new_pages = []
     for (page_idx, result, old_page) in zip(page_idxs, results, selected_pages):

     detection_results = [p.text_lines.bboxes for p in selected_pages]
     polygons = [[b.polygon for b in bboxes] for bboxes in detection_results]
+    results = run_recognition(images, surya_langs, rec_model, processor, polygons=polygons, batch_size=int(get_batch_size() * batch_multiplier))
     new_pages = []
     for (page_idx, result, old_page) in zip(page_idxs, results, selected_pages):

poetry.lock CHANGED Viewed

@@ -27,13 +27,13 @@ files = [
 [[package]]
 name = "anyio"
-version = "4.3.0"
 description = "High level compatibility layer for multiple asynchronous event loop implementations"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "anyio-4.3.0-py3-none-any.whl", hash = "sha256:048e05d0f6caeed70d731f3db756d35dcc1f35747c8c403364a8332c630441b8"},
-    {file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"},
 ]
 [package.dependencies]
@@ -841,13 +841,13 @@ socks = ["socksio (==1.*)"]
 [[package]]
 name = "huggingface-hub"
-version = "0.23.1"
 description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
 optional = false
 python-versions = ">=3.8.0"
 files = [
-    {file = "huggingface_hub-0.23.1-py3-none-any.whl", hash = "sha256:720a5bffd2b1b449deb793da8b0df7a9390a7e238534d5a08c9fbcdecb1dd3cb"},
-    {file = "huggingface_hub-0.23.1.tar.gz", hash = "sha256:4f62dbf6ae94f400c6d3419485e52bce510591432a5248a65d0cb72e4d479eb4"},
 ]
 [package.dependencies]
@@ -2007,13 +2007,13 @@ testing = ["docopt", "pytest"]
 [[package]]
 name = "pdftext"
-version = "0.3.8"
 description = "Extract structured text from pdfs quickly"
 optional = false
 python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9"
 files = [
-    {file = "pdftext-0.3.8-py3-none-any.whl", hash = "sha256:d11aeaf792b96ea878139ad7cd64a92d61cc5e01fec4f3b85ca6da1043d98cbe"},
-    {file = "pdftext-0.3.8.tar.gz", hash = "sha256:1fbf53f0dc636b6863ccbbb6aed693c0e435b531a55a58e3d23bd125a2e0c616"},
 ]
 [package.dependencies]
@@ -2154,13 +2154,13 @@ twisted = ["twisted"]
 [[package]]
 name = "prompt-toolkit"
-version = "3.0.43"
 description = "Library for building powerful interactive command lines in Python"
 optional = false
 python-versions = ">=3.7.0"
 files = [
-    {file = "prompt_toolkit-3.0.43-py3-none-any.whl", hash = "sha256:a11a29cb3bf0a28a387fe5122cdb649816a957cd9261dcedf8c9f1fef33eacf6"},
-    {file = "prompt_toolkit-3.0.43.tar.gz", hash = "sha256:3527b7af26106cbc65a040bcc84839a3566ec1b051bb0bfe953631e704b0ff7d"},
 ]
 [package.dependencies]
@@ -3379,13 +3379,13 @@ tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"]
 [[package]]
 name = "surya-ocr"
-version = "0.4.8"
 description = "OCR, layout, reading order, and line detection in 90+ languages"
 optional = false
 python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9"
 files = [
-    {file = "surya_ocr-0.4.8-py3-none-any.whl", hash = "sha256:6753bf295581f44b3e3452de563a3730a6c91500ea09090927154a1edfe57364"},
-    {file = "surya_ocr-0.4.8.tar.gz", hash = "sha256:01e97db0d43941637ff0ddededa46491f7b0b937dba5c7fbba4ee75177991465"},
 ]
 [package.dependencies]
@@ -3853,13 +3853,13 @@ files = [
 [[package]]
 name = "typing-extensions"
-version = "4.11.0"
 description = "Backported and Experimental Type Hints for Python 3.8+"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "typing_extensions-4.11.0-py3-none-any.whl", hash = "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a"},
-    {file = "typing_extensions-4.11.0.tar.gz", hash = "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0"},
 ]
 [[package]]
@@ -3959,13 +3959,13 @@ files = [
 [[package]]
 name = "zipp"
-version = "3.18.2"
 description = "Backport of pathlib-compatible object wrapper for zip files"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "zipp-3.18.2-py3-none-any.whl", hash = "sha256:dce197b859eb796242b0622af1b8beb0a722d52aa2f57133ead08edd5bf5374e"},
-    {file = "zipp-3.18.2.tar.gz", hash = "sha256:6278d9ddbcfb1f1089a88fde84481528b07b0e10474e09dcfe53dad4069fa059"},
 ]
 [package.extras]
@@ -3975,4 +3975,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13,!=3.9.7"
-content-hash = "5b18da49116103a0e6f69520268d063745c4994911140769bd8d41a1af9b1beb"

 [[package]]
 name = "anyio"
+version = "4.4.0"
 description = "High level compatibility layer for multiple asynchronous event loop implementations"
 optional = false
 python-versions = ">=3.8"
 files = [
+    {file = "anyio-4.4.0-py3-none-any.whl", hash = "sha256:c1b2d8f46a8a812513012e1107cb0e68c17159a7a594208005a57dc776e1bdc7"},
+    {file = "anyio-4.4.0.tar.gz", hash = "sha256:5aadc6a1bbb7cdb0bede386cac5e2940f5e2ff3aa20277e991cf028e0585ce94"},
 ]
 [package.dependencies]
 [[package]]
 name = "huggingface-hub"
+version = "0.23.2"
 description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub"
 optional = false
 python-versions = ">=3.8.0"
 files = [
+    {file = "huggingface_hub-0.23.2-py3-none-any.whl", hash = "sha256:48727a16e704d409c4bb5913613308499664f22a99743435dc3a13b23c485827"},
+    {file = "huggingface_hub-0.23.2.tar.gz", hash = "sha256:f6829b62d5fdecb452a76fdbec620cba4c1573655a8d710c1df71735fd9edbd2"},
 ]
 [package.dependencies]
 [[package]]
 name = "pdftext"
+version = "0.3.10"
 description = "Extract structured text from pdfs quickly"
 optional = false
 python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9"
 files = [
+    {file = "pdftext-0.3.10-py3-none-any.whl", hash = "sha256:99bd900d0d0692df06719c07ce10a859750ade3eb7f10c543f637118417497f9"},
+    {file = "pdftext-0.3.10.tar.gz", hash = "sha256:90de726e818fb5683a0616cabb1a75a32a7224e873c3058006c93da6e440c66c"},
 ]
 [package.dependencies]
 [[package]]
 name = "prompt-toolkit"
+version = "3.0.44"
 description = "Library for building powerful interactive command lines in Python"
 optional = false
 python-versions = ">=3.7.0"
 files = [
+    {file = "prompt_toolkit-3.0.44-py3-none-any.whl", hash = "sha256:205a20669633d042d3722a528b8e7cd3f4dbd9e1450935f596c2cc61166762dd"},
+    {file = "prompt_toolkit-3.0.44.tar.gz", hash = "sha256:c1dfd082c4259964bc8bcce1f8460d9dbeb5d4a37bfc25b8082bc02cd41c8af6"},
 ]
 [package.dependencies]
 [[package]]
 name = "surya-ocr"
+version = "0.4.10"
 description = "OCR, layout, reading order, and line detection in 90+ languages"
 optional = false
 python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9"
 files = [
+    {file = "surya_ocr-0.4.10-py3-none-any.whl", hash = "sha256:18236c422b3855a1f6ece34f96137afd70d78078edc4ae002f972580f37918bb"},
+    {file = "surya_ocr-0.4.10.tar.gz", hash = "sha256:a5ab764c6797e41854aed3e462a526361cf130c0c5da0208575152eefb762685"},
 ]
 [package.dependencies]
 [[package]]
 name = "typing-extensions"
+version = "4.12.0"
 description = "Backported and Experimental Type Hints for Python 3.8+"
 optional = false
 python-versions = ">=3.8"
 files = [
+    {file = "typing_extensions-4.12.0-py3-none-any.whl", hash = "sha256:b349c66bea9016ac22978d800cfff206d5f9816951f12a7d0ec5578b0a819594"},
+    {file = "typing_extensions-4.12.0.tar.gz", hash = "sha256:8cbcdc8606ebcb0d95453ad7dc5065e6237b6aa230a31e81d0f440c30fed5fd8"},
 ]
 [[package]]
 [[package]]
 name = "zipp"
+version = "3.19.0"
 description = "Backport of pathlib-compatible object wrapper for zip files"
 optional = false
 python-versions = ">=3.8"
 files = [
+    {file = "zipp-3.19.0-py3-none-any.whl", hash = "sha256:96dc6ad62f1441bcaccef23b274ec471518daf4fbbc580341204936a5a3dddec"},
+    {file = "zipp-3.19.0.tar.gz", hash = "sha256:952df858fb3164426c976d9338d3961e8e8b3758e2e059e0f754b8c4262625ee"},
 ]
 [package.extras]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13,!=3.9.7"
+content-hash = "fa892a80f72a88ccd0cb9d5e7d1a115f53eb2f19ddd3a5e502e5f57e3d9d2af3"

pyproject.toml CHANGED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "0.2.9"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"
@@ -33,10 +33,10 @@ tabulate = "^0.9.0"
 ftfy = "^6.1.1"
 texify = "^0.1.9"
 rapidfuzz = "^3.8.1"
-surya-ocr = "^0.4.8"
 filetype = "^1.2.0"
 regex = "^2024.4.28"
-pdftext = "^0.3.8"
 grpcio = "^1.63.0"
 [tool.poetry.group.dev.dependencies]

 [tool.poetry]
 name = "marker-pdf"
+version = "0.2.10"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"
 ftfy = "^6.1.1"
 texify = "^0.1.9"
 rapidfuzz = "^3.8.1"
+surya-ocr = "^0.4.10"
 filetype = "^1.2.0"
 regex = "^2024.4.28"
+pdftext = "^0.3.10"
 grpcio = "^1.63.0"
 [tool.poetry.group.dev.dependencies]

scripts/verify_benchmark_scores.py CHANGED Viewed

@@ -9,7 +9,7 @@ def verify_scores(file_path):
     multicolcnn_score = data["marker"]["files"]["multicolcnn.pdf"]["score"]
     switch_trans_score = data["marker"]["files"]["switch_trans.pdf"]["score"]
-    if multicolcnn_score <= 0.4 or switch_trans_score <= 0.4:
         raise ValueError("One or more scores are below the required threshold of 0.4")

     multicolcnn_score = data["marker"]["files"]["multicolcnn.pdf"]["score"]
     switch_trans_score = data["marker"]["files"]["switch_trans.pdf"]["score"]
+    if multicolcnn_score <= 0.39 or switch_trans_score <= 0.4:
         raise ValueError("One or more scores are below the required threshold of 0.4")