Vik Paruchuri
committed on
Commit
·
74cd5a8
1
Parent(s):
92ebbe8
Add more tests
Browse files
- README.md +3 -2
- tests/converters/test_pdf_converter.py +48 -0
README.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
# Marker
|
| 2 |
|
| 3 |
-
Marker converts
|
| 4 |
|
| 5 |
-
-
|
| 6 |
- Formats tables, forms, equations, inline math, links, references, and code blocks
|
| 7 |
- Extracts and saves images
|
| 8 |
- Removes headers/footers/other artifacts
|
|
@@ -320,6 +320,7 @@ When running with the `--use_llm` flag, you have a choice of services you can us
|
|
| 320 |
- `Gemini` - this will use the Gemini developer API by default. You'll need to pass `--gemini_api_key` to configuration.
|
| 321 |
- `Google Vertex` - this will use vertex, which can be more reliable. You'll need to pass `--vertex_project_id`. To use it, set `--llm_service=marker.services.vertex.GoogleVertexService`.
|
| 322 |
- `Ollama` - this will use local models. You can configure `--ollama_base_url` and `--ollama_model`. To use it, set `--llm_service=marker.services.ollama.OllamaService`.
|
|
|
|
| 323 |
|
| 324 |
These services may have additional optional configuration as well - you can see it by viewing the classes.
|
| 325 |
|
|
|
|
| 1 |
# Marker
|
| 2 |
|
| 3 |
+
Marker converts documents to markdown, JSON, and HTML quickly and accurately.
|
| 4 |
|
| 5 |
+
- Converts PDF, image, PPTX, DOCX, XLSX, HTML, EPUB in all languages
|
| 6 |
- Formats tables, forms, equations, inline math, links, references, and code blocks
|
| 7 |
- Extracts and saves images
|
| 8 |
- Removes headers/footers/other artifacts
|
|
|
|
| 320 |
- `Gemini` - this will use the Gemini developer API by default. You'll need to pass `--gemini_api_key` to configuration.
|
| 321 |
- `Google Vertex` - this will use vertex, which can be more reliable. You'll need to pass `--vertex_project_id`. To use it, set `--llm_service=marker.services.vertex.GoogleVertexService`.
|
| 322 |
- `Ollama` - this will use local models. You can configure `--ollama_base_url` and `--ollama_model`. To use it, set `--llm_service=marker.services.ollama.OllamaService`.
|
| 323 |
+
- `Claude` - this will use the Anthropic API. You can configure `--claude_api_key` and `--claude_model_name`. To use it, set `--llm_service=marker.services.claude.ClaudeService`.
|
| 324 |
|
| 325 |
These services may have additional optional configuration as well - you can see it by viewing the classes.
|
| 326 |
|
tests/converters/test_pdf_converter.py
CHANGED
|
@@ -20,3 +20,51 @@ def test_pdf_converter(pdf_converter: PdfConverter, temp_doc):
|
|
| 20 |
# Some assertions for line joining across columns
|
| 21 |
assert "remain similar across a wide range of choices." in markdown # pg: 2
|
| 22 |
assert "a new scheme for designing more robust and efficient" in markdown # pg: 8
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
# Some assertions for line joining across columns
|
| 21 |
assert "remain similar across a wide range of choices." in markdown # pg: 2
|
| 22 |
assert "a new scheme for designing more robust and efficient" in markdown # pg: 8
|
| 23 |
+
|
| 24 |
+
@pytest.mark.filename("manual.epub")
|
| 25 |
+
@pytest.mark.config({"page_range": [0]})
|
| 26 |
+
def test_epub_converter(pdf_converter: PdfConverter, temp_doc):
|
| 27 |
+
markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
|
| 28 |
+
markdown = markdown_output.markdown
|
| 29 |
+
|
| 30 |
+
# Basic assertions
|
| 31 |
+
assert "Simple Sabotage Field Manual" in markdown
|
| 32 |
+
|
| 33 |
+
@pytest.mark.filename("single_sheet.xlsx")
|
| 34 |
+
@pytest.mark.config({"page_range": [0]})
|
| 35 |
+
def test_xlsx_converter(pdf_converter: PdfConverter, temp_doc):
|
| 36 |
+
markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
|
| 37 |
+
markdown = markdown_output.markdown
|
| 38 |
+
|
| 39 |
+
# Basic assertions
|
| 40 |
+
assert "four" in markdown
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@pytest.mark.filename("china.html")
|
| 44 |
+
@pytest.mark.config({"page_range": [10]})
|
| 45 |
+
def test_html_converter(pdf_converter: PdfConverter, temp_doc):
|
| 46 |
+
markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
|
| 47 |
+
markdown = markdown_output.markdown
|
| 48 |
+
|
| 49 |
+
# Basic assertions
|
| 50 |
+
assert "Beijing" in markdown
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@pytest.mark.filename("gatsby.docx")
|
| 54 |
+
@pytest.mark.config({"page_range": [0]})
|
| 55 |
+
def test_docx_converter(pdf_converter: PdfConverter, temp_doc):
|
| 56 |
+
markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
|
| 57 |
+
markdown = markdown_output.markdown
|
| 58 |
+
|
| 59 |
+
# Basic assertions
|
| 60 |
+
assert "The Decline of the American Dream in the 1920s" in markdown
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
@pytest.mark.filename("lambda.pptx")
|
| 64 |
+
@pytest.mark.config({"page_range": [0]})
|
| 65 |
+
def test_pptx_converter(pdf_converter: PdfConverter, temp_doc):
|
| 66 |
+
markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
|
| 67 |
+
markdown = markdown_output.markdown
|
| 68 |
+
|
| 69 |
+
# Basic assertions
|
| 70 |
+
assert "Adam Doupé" in markdown
|