Vik Paruchuri
committed on
Commit
·
6fa9fe6
1
Parent(s):
d6bdda8
Set OCR engine to None
Browse files
- .github/workflows/tests.yml +1 -1
- README.md +1 -1
- marker/ocr/recognition.py +1 -1
- poetry.lock +0 -0
- pyproject.toml +2 -0
.github/workflows/tests.yml
CHANGED
|
@@ -32,7 +32,7 @@ jobs:
|
|
| 32 |
- name: Run table benchmark
|
| 33 |
run: |
|
| 34 |
poetry run python benchmarks/table.py tables.json
|
| 35 |
-
poetry run python scripts/verify_benchmark_scores.py
|
| 36 |
|
| 37 |
|
| 38 |
|
|
|
|
| 32 |
- name: Run table benchmark
|
| 33 |
run: |
|
| 34 |
poetry run python benchmarks/table.py tables.json
|
| 35 |
+
poetry run python scripts/verify_benchmark_scores.py tables.json --type table
|
| 36 |
|
| 37 |
|
| 38 |
|
README.md
CHANGED
|
@@ -141,7 +141,7 @@ MIN_LENGTH=10000 METADATA_FILE=../pdf_meta.json NUM_DEVICES=4 NUM_WORKERS=15 mar
|
|
| 141 |
|
| 142 |
- `METADATA_FILE` is an optional path to a json file with metadata about the pdfs. See above for the format.
|
| 143 |
- `NUM_DEVICES` is the number of GPUs to use. Should be `2` or greater.
|
| 144 |
-
- `NUM_WORKERS` is the number of parallel processes to run on each GPU.
|
| 145 |
- `MIN_LENGTH` is the minimum number of characters that need to be extracted from a pdf before it will be considered for processing. If you're processing a lot of pdfs, I recommend setting this to avoid OCRing pdfs that are mostly images. (slows everything down)
|
| 146 |
|
| 147 |
Note that the env variables above are specific to this script, and cannot be set in `local.env`.
|
|
|
|
| 141 |
|
| 142 |
- `METADATA_FILE` is an optional path to a json file with metadata about the pdfs. See above for the format.
|
| 143 |
- `NUM_DEVICES` is the number of GPUs to use. Should be `2` or greater.
|
| 144 |
+
- `NUM_WORKERS` is the number of parallel processes to run on each GPU.
|
| 145 |
- `MIN_LENGTH` is the minimum number of characters that need to be extracted from a pdf before it will be considered for processing. If you're processing a lot of pdfs, I recommend setting this to avoid OCRing pdfs that are mostly images. (slows everything down)
|
| 146 |
|
| 147 |
Note that the env variables above are specific to this script, and cannot be set in `local.env`.
|
marker/ocr/recognition.py
CHANGED
|
@@ -45,7 +45,7 @@ def run_ocr(doc, pages: List[Page], langs: List[str], rec_model, batch_multiplie
|
|
| 45 |
return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
|
| 46 |
|
| 47 |
ocr_method = settings.OCR_ENGINE
|
| 48 |
-
if ocr_method is None:
|
| 49 |
return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
|
| 50 |
elif ocr_method == "surya":
|
| 51 |
new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages, batch_multiplier=batch_multiplier)
|
|
|
|
| 45 |
return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
|
| 46 |
|
| 47 |
ocr_method = settings.OCR_ENGINE
|
| 48 |
+
if ocr_method is None or ocr_method == "None":
|
| 49 |
return pages, {"ocr_pages": 0, "ocr_failed": 0, "ocr_success": 0, "ocr_engine": "none"}
|
| 50 |
elif ocr_method == "surya":
|
| 51 |
new_pages = surya_recognition(doc, ocr_idxs, langs, rec_model, pages, batch_multiplier=batch_multiplier)
|
poetry.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
CHANGED
|
@@ -42,6 +42,8 @@ grpcio = "^1.63.0"
|
|
| 42 |
|
| 43 |
[tool.poetry.group.dev.dependencies]
|
| 44 |
jupyter = "^1.0.0"
|
|
|
|
|
|
|
| 45 |
|
| 46 |
[tool.poetry.scripts]
|
| 47 |
marker = "convert:main"
|
|
|
|
| 42 |
|
| 43 |
[tool.poetry.group.dev.dependencies]
|
| 44 |
jupyter = "^1.0.0"
|
| 45 |
+
datasets = "^2.21.0"
|
| 46 |
+
streamlit = "^1.37.1"
|
| 47 |
|
| 48 |
[tool.poetry.scripts]
|
| 49 |
marker = "convert:main"
|