Vik Paruchuri
committed on
Commit
·
65d4b3a
1
Parent(s):
7fd45b1
Update examples, OCR fix
Browse files
- README.md +1 -1
- marker/builders/llm_layout.py +1 -11
- marker/builders/ocr.py +1 -1
- marker/processors/blockquote.py +1 -1
- marker/processors/llm/__init__.py +0 -8
- marker/services/__init__.py +12 -3
- marker/services/gemini.py +10 -4
- marker/services/ollama.py +2 -3
- marker/services/vertex.py +1 -1
- pyproject.toml +1 -1
README.md
CHANGED
|
@@ -22,7 +22,7 @@ See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instruc
|
|
| 22 |
|
| 23 |
## Hybrid Mode
|
| 24 |
|
| 25 |
-
For the highest accuracy, pass the `--use_llm` flag to use an LLM alongside marker. This will do things like merge tables across pages, handle inline math, format tables properly, and extract values from forms. It can use any
|
| 26 |
|
| 27 |
Here is a table benchmark comparing marker, gemini flash alone, and marker with use_llm:
|
| 28 |
|
|
|
|
| 22 |
|
| 23 |
## Hybrid Mode
|
| 24 |
|
| 25 |
+
For the highest accuracy, pass the `--use_llm` flag to use an LLM alongside marker. This will do things like merge tables across pages, handle inline math, format tables properly, and extract values from forms. It can use any gemini or ollama model. By default, it uses `gemini-2.0-flash`. See [below](#llm-services) for details.
|
| 26 |
|
| 27 |
Here is a table benchmark comparing marker, gemini flash alone, and marker with use_llm:
|
| 28 |
|
marker/builders/llm_layout.py
CHANGED
|
@@ -37,18 +37,10 @@ class LLMLayoutBuilder(LayoutBuilder):
|
|
| 37 |
str,
|
| 38 |
"The name of the Gemini model to use.",
|
| 39 |
] = "gemini-2.0-flash"
|
| 40 |
-
max_retries: Annotated[
|
| 41 |
-
int,
|
| 42 |
-
"The maximum number of retries to use for the Gemini model.",
|
| 43 |
-
] = 2
|
| 44 |
max_concurrency: Annotated[
|
| 45 |
int,
|
| 46 |
"The maximum number of concurrent requests to make to the Gemini model.",
|
| 47 |
] = 3
|
| 48 |
-
timeout: Annotated[
|
| 49 |
-
int,
|
| 50 |
-
"The timeout for requests to the Gemini model.",
|
| 51 |
-
] = 60
|
| 52 |
disable_tqdm: Annotated[
|
| 53 |
bool,
|
| 54 |
"Whether to disable the tqdm progress bar.",
|
|
@@ -162,9 +154,7 @@ Respond only with one of `Figure`, `Picture`, `ComplexRegion`, `Table`, or `Form
|
|
| 162 |
prompt,
|
| 163 |
image,
|
| 164 |
block,
|
| 165 |
-
LayoutSchema
|
| 166 |
-
max_retries=self.max_retries,
|
| 167 |
-
timeout=self.timeout
|
| 168 |
)
|
| 169 |
generated_label = None
|
| 170 |
if response and "label" in response:
|
|
|
|
| 37 |
str,
|
| 38 |
"The name of the Gemini model to use.",
|
| 39 |
] = "gemini-2.0-flash"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
max_concurrency: Annotated[
|
| 41 |
int,
|
| 42 |
"The maximum number of concurrent requests to make to the Gemini model.",
|
| 43 |
] = 3
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
disable_tqdm: Annotated[
|
| 45 |
bool,
|
| 46 |
"Whether to disable the tqdm progress bar.",
|
|
|
|
| 154 |
prompt,
|
| 155 |
image,
|
| 156 |
block,
|
| 157 |
+
LayoutSchema
|
|
|
|
|
|
|
| 158 |
)
|
| 159 |
generated_label = None
|
| 160 |
if response and "label" in response:
|
marker/builders/ocr.py
CHANGED
|
@@ -81,7 +81,7 @@ class OcrBuilder(BaseBuilder):
|
|
| 81 |
recognition_results = self.recognition_model(
|
| 82 |
images=images,
|
| 83 |
bboxes=line_boxes,
|
| 84 |
-
langs=[self.languages] * len(
|
| 85 |
recognition_batch_size=int(self.get_recognition_batch_size()),
|
| 86 |
sort_lines=False
|
| 87 |
)
|
|
|
|
| 81 |
recognition_results = self.recognition_model(
|
| 82 |
images=images,
|
| 83 |
bboxes=line_boxes,
|
| 84 |
+
langs=[self.languages] * len(pages),
|
| 85 |
recognition_batch_size=int(self.get_recognition_batch_size()),
|
| 86 |
sort_lines=False
|
| 87 |
)
|
marker/processors/blockquote.py
CHANGED
|
@@ -17,7 +17,7 @@ class BlockquoteProcessor(BaseProcessor):
|
|
| 17 |
float,
|
| 18 |
"The minimum horizontal indentation required to consider a block as part of a blockquote.",
|
| 19 |
"Expressed as a percentage of the block width.",
|
| 20 |
-
] = 0.
|
| 21 |
x_start_tolerance: Annotated[
|
| 22 |
float,
|
| 23 |
"The maximum allowable difference between the starting x-coordinates of consecutive blocks to consider them aligned.",
|
|
|
|
| 17 |
float,
|
| 18 |
"The minimum horizontal indentation required to consider a block as part of a blockquote.",
|
| 19 |
"Expressed as a percentage of the block width.",
|
| 20 |
+
] = 0.1
|
| 21 |
x_start_tolerance: Annotated[
|
| 22 |
float,
|
| 23 |
"The maximum allowable difference between the starting x-coordinates of consecutive blocks to consider them aligned.",
|
marker/processors/llm/__init__.py
CHANGED
|
@@ -42,18 +42,10 @@ class BaseLLMProcessor(BaseProcessor):
|
|
| 42 |
str,
|
| 43 |
"The name of the Gemini model to use.",
|
| 44 |
] = "gemini-2.0-flash"
|
| 45 |
-
max_retries: Annotated[
|
| 46 |
-
int,
|
| 47 |
-
"The maximum number of retries to use for the Gemini model.",
|
| 48 |
-
] = 1
|
| 49 |
max_concurrency: Annotated[
|
| 50 |
int,
|
| 51 |
"The maximum number of concurrent requests to make to the Gemini model.",
|
| 52 |
] = 3
|
| 53 |
-
timeout: Annotated[
|
| 54 |
-
int,
|
| 55 |
-
"The timeout for requests to the Gemini model.",
|
| 56 |
-
] = 20
|
| 57 |
image_expansion_ratio: Annotated[
|
| 58 |
float,
|
| 59 |
"The ratio to expand the image by when cropping.",
|
|
|
|
| 42 |
str,
|
| 43 |
"The name of the Gemini model to use.",
|
| 44 |
] = "gemini-2.0-flash"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
max_concurrency: Annotated[
|
| 46 |
int,
|
| 47 |
"The maximum number of concurrent requests to make to the Gemini model.",
|
| 48 |
] = 3
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
image_expansion_ratio: Annotated[
|
| 50 |
float,
|
| 51 |
"The ratio to expand the image by when cropping.",
|
marker/services/__init__.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from typing import Optional, List
|
| 2 |
|
| 3 |
import PIL
|
| 4 |
from pydantic import BaseModel
|
|
@@ -8,6 +8,15 @@ from marker.util import assign_config, verify_config_keys
|
|
| 8 |
|
| 9 |
|
| 10 |
class BaseService:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
def __init__(self, config: Optional[BaseModel | dict] = None):
|
| 12 |
assign_config(self, config)
|
| 13 |
|
|
@@ -20,7 +29,7 @@ class BaseService:
|
|
| 20 |
image: PIL.Image.Image | List[PIL.Image.Image],
|
| 21 |
block: Block,
|
| 22 |
response_schema: type[BaseModel],
|
| 23 |
-
max_retries: int =
|
| 24 |
-
timeout: int =
|
| 25 |
):
|
| 26 |
raise NotImplementedError
|
|
|
|
| 1 |
+
from typing import Optional, List, Annotated
|
| 2 |
|
| 3 |
import PIL
|
| 4 |
from pydantic import BaseModel
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
class BaseService:
|
| 11 |
+
timeout: Annotated[
|
| 12 |
+
int,
|
| 13 |
+
"The timeout to use for the service."
|
| 14 |
+
] = 15
|
| 15 |
+
max_retries: Annotated[
|
| 16 |
+
int,
|
| 17 |
+
"The maximum number of retries to use for the service."
|
| 18 |
+
] = 1
|
| 19 |
+
|
| 20 |
def __init__(self, config: Optional[BaseModel | dict] = None):
|
| 21 |
assign_config(self, config)
|
| 22 |
|
|
|
|
| 29 |
image: PIL.Image.Image | List[PIL.Image.Image],
|
| 30 |
block: Block,
|
| 31 |
response_schema: type[BaseModel],
|
| 32 |
+
max_retries: int | None = None,
|
| 33 |
+
timeout: int | None = None
|
| 34 |
):
|
| 35 |
raise NotImplementedError
|
marker/services/gemini.py
CHANGED
|
@@ -23,7 +23,7 @@ class BaseGeminiService(BaseService):
|
|
| 23 |
img.save(image_bytes, format="PNG")
|
| 24 |
return image_bytes.getvalue()
|
| 25 |
|
| 26 |
-
def get_google_client(self, timeout: int
|
| 27 |
raise NotImplementedError
|
| 28 |
|
| 29 |
def __call__(
|
|
@@ -32,9 +32,15 @@ class BaseGeminiService(BaseService):
|
|
| 32 |
image: PIL.Image.Image | List[PIL.Image.Image],
|
| 33 |
block: Block,
|
| 34 |
response_schema: type[BaseModel],
|
| 35 |
-
max_retries: int =
|
| 36 |
-
timeout: int =
|
| 37 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
if not isinstance(image, list):
|
| 39 |
image = [image]
|
| 40 |
|
|
@@ -80,7 +86,7 @@ class GoogleGeminiService(BaseGeminiService):
|
|
| 80 |
"The Google API key to use for the service."
|
| 81 |
] = None
|
| 82 |
|
| 83 |
-
def get_google_client(self, timeout: int
|
| 84 |
return genai.Client(
|
| 85 |
api_key=self.gemini_api_key,
|
| 86 |
http_options={"timeout": timeout * 1000} # Convert to milliseconds
|
|
|
|
| 23 |
img.save(image_bytes, format="PNG")
|
| 24 |
return image_bytes.getvalue()
|
| 25 |
|
| 26 |
+
def get_google_client(self, timeout: int):
|
| 27 |
raise NotImplementedError
|
| 28 |
|
| 29 |
def __call__(
|
|
|
|
| 32 |
image: PIL.Image.Image | List[PIL.Image.Image],
|
| 33 |
block: Block,
|
| 34 |
response_schema: type[BaseModel],
|
| 35 |
+
max_retries: int | None = None,
|
| 36 |
+
timeout: int | None = None
|
| 37 |
):
|
| 38 |
+
if max_retries is None:
|
| 39 |
+
max_retries = self.max_retries
|
| 40 |
+
|
| 41 |
+
if timeout is None:
|
| 42 |
+
timeout = self.timeout
|
| 43 |
+
|
| 44 |
if not isinstance(image, list):
|
| 45 |
image = [image]
|
| 46 |
|
|
|
|
| 86 |
"The Google API key to use for the service."
|
| 87 |
] = None
|
| 88 |
|
| 89 |
+
def get_google_client(self, timeout: int):
|
| 90 |
return genai.Client(
|
| 91 |
api_key=self.gemini_api_key,
|
| 92 |
http_options={"timeout": timeout * 1000} # Convert to milliseconds
|
marker/services/ollama.py
CHANGED
|
@@ -32,8 +32,8 @@ class OllamaService(BaseService):
|
|
| 32 |
image: PIL.Image.Image | List[PIL.Image.Image],
|
| 33 |
block: Block,
|
| 34 |
response_schema: type[BaseModel],
|
| 35 |
-
max_retries: int =
|
| 36 |
-
timeout: int =
|
| 37 |
):
|
| 38 |
url = f"{self.ollama_base_url}/api/generate"
|
| 39 |
headers = {"Content-Type": "application/json"}
|
|
@@ -63,7 +63,6 @@ class OllamaService(BaseService):
|
|
| 63 |
response.raise_for_status()
|
| 64 |
response_data = response.json()
|
| 65 |
data = response_data["response"]
|
| 66 |
-
print(data)
|
| 67 |
return json.loads(data)
|
| 68 |
except Exception as e:
|
| 69 |
print(f"Ollama inference failed: {e}")
|
|
|
|
| 32 |
image: PIL.Image.Image | List[PIL.Image.Image],
|
| 33 |
block: Block,
|
| 34 |
response_schema: type[BaseModel],
|
| 35 |
+
max_retries: int | None = None,
|
| 36 |
+
timeout: int | None = None
|
| 37 |
):
|
| 38 |
url = f"{self.ollama_base_url}/api/generate"
|
| 39 |
headers = {"Content-Type": "application/json"}
|
|
|
|
| 63 |
response.raise_for_status()
|
| 64 |
response_data = response.json()
|
| 65 |
data = response_data["response"]
|
|
|
|
| 66 |
return json.loads(data)
|
| 67 |
except Exception as e:
|
| 68 |
print(f"Ollama inference failed: {e}")
|
marker/services/vertex.py
CHANGED
|
@@ -18,7 +18,7 @@ class GoogleVertexService(BaseGeminiService):
|
|
| 18 |
"The name of the Google model to use for the service."
|
| 19 |
] = "gemini-1.5-flash-002"
|
| 20 |
|
| 21 |
-
def get_google_client(self, timeout: int
|
| 22 |
return genai.Client(
|
| 23 |
vertexai=True,
|
| 24 |
project=self.vertex_project_id,
|
|
|
|
| 18 |
"The name of the Google model to use for the service."
|
| 19 |
] = "gemini-1.5-flash-002"
|
| 20 |
|
| 21 |
+
def get_google_client(self, timeout: int):
|
| 22 |
return genai.Client(
|
| 23 |
vertexai=True,
|
| 24 |
project=self.vertex_project_id,
|
pyproject.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
-
version = "1.
|
| 4 |
description = "Convert PDF to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|
|
|
|
| 1 |
[tool.poetry]
|
| 2 |
name = "marker-pdf"
|
| 3 |
+
version = "1.5.0"
|
| 4 |
description = "Convert PDF to markdown with high speed and accuracy."
|
| 5 |
authors = ["Vik Paruchuri <github@vikas.sh>"]
|
| 6 |
readme = "README.md"
|