Vik Paruchuri
committed on
Commit
·
3453dd8
1
Parent(s):
818619c
Small bugfix
Browse files
README.md
CHANGED
|
@@ -104,6 +104,7 @@ marker_single /path/to/file.pdf
|
|
| 104 |
Options:
|
| 105 |
- `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.
|
| 106 |
- `--output_format [markdown|json|html]`: Specify the format for the output results.
|
|
|
|
| 107 |
- `--use_llm`: Uses an LLM to improve accuracy. You must set your Gemini API key using the `GOOGLE_API_KEY` env var.
|
| 108 |
- `--disable_image_extraction`: Don't extract images from the PDF. If you also specify `--use_llm`, then images will be replaced with a description.
|
| 109 |
- `--page_range TEXT`: Specify which pages to process. Accepts comma-separated page numbers and ranges. Example: `--page_range "0,5-10,20"` will process pages 0, 5 through 10, and page 20.
|
|
|
|
| 104 |
Options:
|
| 105 |
- `--output_dir PATH`: Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR.
|
| 106 |
- `--output_format [markdown|json|html]`: Specify the format for the output results.
|
| 107 |
+
- `--paginate_output`: Paginates the output, using `\n\n{PAGE_NUMBER}` followed by 48 `-` characters and then `\n\n` as the page separator.
|
| 108 |
- `--use_llm`: Uses an LLM to improve accuracy. You must set your Gemini API key using the `GOOGLE_API_KEY` env var.
|
| 109 |
- `--disable_image_extraction`: Don't extract images from the PDF. If you also specify `--use_llm`, then images will be replaced with a description.
|
| 110 |
- `--page_range TEXT`: Specify which pages to process. Accepts comma-separated page numbers and ranges. Example: `--page_range "0,5-10,20"` will process pages 0, 5 through 10, and page 20.
|
marker/processors/llm/llm_table_merge.py
ADDED
|
File without changes
|
marker/processors/llm/utils.py
CHANGED
|
@@ -35,7 +35,7 @@ class GoogleModel:
|
|
| 35 |
while tries < max_retries:
|
| 36 |
try:
|
| 37 |
responses = self.model.generate_content(
|
| 38 |
-
[prompt, image
|
| 39 |
stream=False,
|
| 40 |
generation_config={
|
| 41 |
"temperature": 0,
|
|
|
|
| 35 |
while tries < max_retries:
|
| 36 |
try:
|
| 37 |
responses = self.model.generate_content(
|
| 38 |
+
[image, prompt], # According to the Gemini docs, the model performs better when the image is the first element
|
| 39 |
stream=False,
|
| 40 |
generation_config={
|
| 41 |
"temperature": 0,
|
marker/processors/sectionheader.py
CHANGED
|
@@ -54,11 +54,8 @@ class SectionHeaderProcessor(BaseProcessor):
|
|
| 54 |
heading_ranges = self.bucket_headings(flat_line_heights)
|
| 55 |
|
| 56 |
for page in document.pages:
|
| 57 |
-
for block in page.
|
| 58 |
-
|
| 59 |
-
continue
|
| 60 |
-
|
| 61 |
-
block_height = line_heights[block.id]
|
| 62 |
if block_height > 0:
|
| 63 |
for idx, (min_height, max_height) in enumerate(heading_ranges):
|
| 64 |
if block_height >= min_height * self.height_tolerance:
|
|
|
|
| 54 |
heading_ranges = self.bucket_headings(flat_line_heights)
|
| 55 |
|
| 56 |
for page in document.pages:
|
| 57 |
+
for block in page.contained_blocks(document, self.block_types):
|
| 58 |
+
block_height = line_heights.get(block.id, 0)
|
|
|
|
|
|
|
|
|
|
| 59 |
if block_height > 0:
|
| 60 |
for idx, (min_height, max_height) in enumerate(heading_ranges):
|
| 61 |
if block_height >= min_height * self.height_tolerance:
|