Vik Paruchuri committed
Commit 65d4b3a · 1 Parent(s): 7fd45b1

Update examples, OCR fix

README.md CHANGED
@@ -22,7 +22,7 @@ See [below](#benchmarks) for detailed speed and accuracy benchmarks, and instruc
 
 ## Hybrid Mode
 
-For the highest accuracy, pass the `--use_llm` flag to use an LLM alongside marker. This will do things like merge tables across pages, handle inline math, format tables properly, and extract values from forms. It can use any Google model (`gemini-2.0-flash` by default), or any ollama model. See [below](#llm-services) for details.
+For the highest accuracy, pass the `--use_llm` flag to use an LLM alongside marker. This will do things like merge tables across pages, handle inline math, format tables properly, and extract values from forms. It can use any gemini or ollama model. By default, it uses `gemini-2.0-flash`. See [below](#llm-services) for details.
 
 Here is a table benchmark comparing marker, gemini flash alone, and marker with use_llm:
 
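For readers who drive marker from Python rather than the CLI, here is a minimal sketch of what hybrid mode looks like programmatically. This is an assumption-laden illustration: `PdfConverter` and `create_model_dict` follow marker's documented Python API, and the `use_llm` config key is inferred from the flag above rather than confirmed by this commit.

```python
# Hedged sketch: enabling hybrid mode from Python. The converter API and
# the "use_llm" config key are assumptions, not confirmed by this commit.
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict

converter = PdfConverter(
    artifact_dict=create_model_dict(),
    config={"use_llm": True},  # assumed to mirror the --use_llm CLI flag
)
rendered = converter("example.pdf")  # path is a placeholder
```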
marker/builders/llm_layout.py CHANGED
@@ -37,18 +37,10 @@ class LLMLayoutBuilder(LayoutBuilder):
         str,
         "The name of the Gemini model to use.",
     ] = "gemini-2.0-flash"
-    max_retries: Annotated[
-        int,
-        "The maximum number of retries to use for the Gemini model.",
-    ] = 2
     max_concurrency: Annotated[
         int,
         "The maximum number of concurrent requests to make to the Gemini model.",
     ] = 3
-    timeout: Annotated[
-        int,
-        "The timeout for requests to the Gemini model.",
-    ] = 60
     disable_tqdm: Annotated[
         bool,
         "Whether to disable the tqdm progress bar.",
@@ -162,9 +154,7 @@ Respond only with one of `Figure`, `Picture`, `ComplexRegion`, `Table`, or `Form
             prompt,
             image,
             block,
-            LayoutSchema,
-            max_retries=self.max_retries,
-            timeout=self.timeout
+            LayoutSchema
         )
         generated_label = None
         if response and "label" in response:
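After this change, the layout builder's LLM call passes only the prompt, image, block, and schema; retry and timeout policy is resolved inside the service (see `marker/services/__init__.py` below). A schematic of the resulting call, noting that the attribute holding the service is not visible in this hunk and is named hypothetically:

```python
# Schematic only: `llm_service` is a hypothetical name for the BaseService
# instance the builder calls; max_retries/timeout are no longer passed here.
response = self.llm_service(
    prompt,
    image,
    block,
    LayoutSchema,
)
```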
marker/builders/ocr.py CHANGED
@@ -81,7 +81,7 @@ class OcrBuilder(BaseBuilder):
         recognition_results = self.recognition_model(
             images=images,
             bboxes=line_boxes,
-            langs=[self.languages] * len(document.pages),
+            langs=[self.languages] * len(pages),
             recognition_batch_size=int(self.get_recognition_batch_size()),
             sort_lines=False
         )
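This is the "OCR fix" from the commit message: the language list must be the same length as the set of pages actually submitted for recognition, which (reading the surrounding code) may be a subset of `document.pages`. An illustrative, standalone sketch of the length mismatch the old code could produce:

```python
# Illustrative stand-ins: a 10-page document where only 3 pages need OCR.
document_pages = list(range(10))  # stands in for document.pages
pages = [2, 5, 7]                 # stands in for the pages selected for OCR
languages = ["en"]

langs_old = [languages] * len(document_pages)  # 10 entries: too many
langs_new = [languages] * len(pages)           # 3 entries: one per OCR'd page
assert len(langs_new) == len(pages)
```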
marker/processors/blockquote.py CHANGED
@@ -17,7 +17,7 @@ class BlockquoteProcessor(BaseProcessor):
         float,
         "The minimum horizontal indentation required to consider a block as part of a blockquote.",
         "Expressed as a percentage of the block width.",
-    ] = 0.05
+    ] = 0.1
     x_start_tolerance: Annotated[
         float,
         "The maximum allowable difference between the starting x-coordinates of consecutive blocks to consider them aligned.",
marker/processors/llm/__init__.py CHANGED
@@ -42,18 +42,10 @@ class BaseLLMProcessor(BaseProcessor):
         str,
         "The name of the Gemini model to use.",
     ] = "gemini-2.0-flash"
-    max_retries: Annotated[
-        int,
-        "The maximum number of retries to use for the Gemini model.",
-    ] = 1
     max_concurrency: Annotated[
         int,
         "The maximum number of concurrent requests to make to the Gemini model.",
     ] = 3
-    timeout: Annotated[
-        int,
-        "The timeout for requests to the Gemini model.",
-    ] = 20
     image_expansion_ratio: Annotated[
         float,
         "The ratio to expand the image by when cropping.",
marker/services/__init__.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Optional, List
+from typing import Optional, List, Annotated
 
 import PIL
 from pydantic import BaseModel
@@ -8,6 +8,15 @@ from marker.util import assign_config, verify_config_keys
 
 
 class BaseService:
+    timeout: Annotated[
+        int,
+        "The timeout to use for the service."
+    ] = 15
+    max_retries: Annotated[
+        int,
+        "The maximum number of retries to use for the service."
+    ] = 1
+
     def __init__(self, config: Optional[BaseModel | dict] = None):
         assign_config(self, config)
 
@@ -20,7 +29,7 @@ class BaseService:
         image: PIL.Image.Image | List[PIL.Image.Image],
         block: Block,
         response_schema: type[BaseModel],
-        max_retries: int = 1,
-        timeout: int = 15
+        max_retries: int | None = None,
+        timeout: int | None = None
     ):
         raise NotImplementedError
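The net effect is a simple precedence rule: an explicit per-call value wins, and `None` falls back to the service's configured (or default) values. A self-contained distillation of the pattern:

```python
from typing import Annotated, Optional

class ServiceSketch:
    # Class-level defaults, mirroring the new BaseService attributes.
    timeout: Annotated[int, "The timeout to use for the service."] = 15
    max_retries: Annotated[int, "The maximum number of retries to use for the service."] = 1

    def __call__(self, max_retries: Optional[int] = None, timeout: Optional[int] = None):
        # None means "use the service-level setting".
        if max_retries is None:
            max_retries = self.max_retries
        if timeout is None:
            timeout = self.timeout
        return max_retries, timeout

assert ServiceSketch()() == (1, 15)            # all defaults
assert ServiceSketch()(timeout=60) == (1, 60)  # per-call override wins
```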
marker/services/gemini.py CHANGED
@@ -23,7 +23,7 @@ class BaseGeminiService(BaseService):
         img.save(image_bytes, format="PNG")
         return image_bytes.getvalue()
 
-    def get_google_client(self, timeout: int = 60):
+    def get_google_client(self, timeout: int):
         raise NotImplementedError
 
     def __call__(
@@ -32,9 +32,15 @@ class BaseGeminiService(BaseService):
         image: PIL.Image.Image | List[PIL.Image.Image],
         block: Block,
         response_schema: type[BaseModel],
-        max_retries: int = 1,
-        timeout: int = 15
+        max_retries: int | None = None,
+        timeout: int | None = None
     ):
+        if max_retries is None:
+            max_retries = self.max_retries
+
+        if timeout is None:
+            timeout = self.timeout
+
         if not isinstance(image, list):
             image = [image]
 
@@ -80,7 +86,7 @@ class GoogleGeminiService(BaseGeminiService):
         "The Google API key to use for the service."
     ] = None
 
-    def get_google_client(self, timeout: int = 60):
+    def get_google_client(self, timeout: int):
         return genai.Client(
             api_key=self.gemini_api_key,
             http_options={"timeout": timeout * 1000} # Convert to milliseconds
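One unit detail: `BaseService.timeout` is expressed in seconds, while the timeout handed to `genai.Client` via `http_options` is in milliseconds (per the comment in the diff), hence the `* 1000`:

```python
timeout_s = 15                      # BaseService default, in seconds
http_timeout_ms = timeout_s * 1000  # 15000 ms, the value passed in http_options
```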
marker/services/ollama.py CHANGED
@@ -32,8 +32,8 @@ class OllamaService(BaseService):
         image: PIL.Image.Image | List[PIL.Image.Image],
         block: Block,
         response_schema: type[BaseModel],
-        max_retries: int = 1,
-        timeout: int = 15
+        max_retries: int | None = None,
+        timeout: int | None = None
     ):
         url = f"{self.ollama_base_url}/api/generate"
         headers = {"Content-Type": "application/json"}
@@ -63,7 +63,6 @@ class OllamaService(BaseService):
             response.raise_for_status()
             response_data = response.json()
             data = response_data["response"]
-            print(data)
             return json.loads(data)
         except Exception as e:
             print(f"Ollama inference failed: {e}")
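`OllamaService` inherits the same defaults, so the construction route is identical; `ollama_base_url` is the attribute this diff shows being used to build the request URL. A hedged usage sketch (the URL is the conventional local Ollama endpoint, assumed here):

```python
# Sketch: configuring the Ollama-backed service with a custom timeout.
from marker.services.ollama import OllamaService

service = OllamaService(config={
    "ollama_base_url": "http://localhost:11434",  # assumed local default
    "timeout": 30,  # seconds; omits to the BaseService default of 15
})
```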
marker/services/vertex.py CHANGED
@@ -18,7 +18,7 @@ class GoogleVertexService(BaseGeminiService):
         "The name of the Google model to use for the service."
     ] = "gemini-1.5-flash-002"
 
-    def get_google_client(self, timeout: int = 60):
+    def get_google_client(self, timeout: int):
         return genai.Client(
             vertexai=True,
             project=self.vertex_project_id,
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "1.4.0"
+version = "1.5.0"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"