Spaces:
Sleeping
Sleeping
Commit
·
a3cafa2
1
Parent(s):
671205b
Update PDF to Markdown converter API with NVIDIA L4 support
Browse files
pdf_converter/convert_pdf_to_md.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
import marker
|
| 2 |
import os
|
| 3 |
import sys
|
|
|
|
|
|
|
| 4 |
|
| 5 |
def convert_pdf(pdf_input_path, output_md_path=None):
|
| 6 |
"""
|
|
@@ -20,11 +22,27 @@ def convert_pdf(pdf_input_path, output_md_path=None):
|
|
| 20 |
print(f"Starting conversion of '{pdf_input_path}'...")
|
| 21 |
|
| 22 |
try:
|
| 23 |
-
#
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
# Convert the PDF to markdown using marker
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
# If output path is provided, save the markdown
|
| 30 |
if output_md_path:
|
|
@@ -40,4 +58,5 @@ def convert_pdf(pdf_input_path, output_md_path=None):
|
|
| 40 |
|
| 41 |
except Exception as e:
|
| 42 |
print(f"An error occurred during conversion: {e}", file=sys.stderr)
|
|
|
|
| 43 |
raise
|
|
|
|
| 1 |
import marker
|
| 2 |
import os
|
| 3 |
import sys
|
| 4 |
+
from marker.config.parser import ConfigParser
|
| 5 |
+
from marker.models import create_model_dict
|
| 6 |
|
| 7 |
def convert_pdf(pdf_input_path, output_md_path=None):
|
| 8 |
"""
|
|
|
|
| 22 |
print(f"Starting conversion of '{pdf_input_path}'...")
|
| 23 |
|
| 24 |
try:
|
| 25 |
+
# Create configuration
|
| 26 |
+
config_parser = ConfigParser({})
|
| 27 |
+
|
| 28 |
+
# Load models
|
| 29 |
+
models = create_model_dict()
|
| 30 |
+
|
| 31 |
+
# Get converter class and create converter
|
| 32 |
+
converter_cls = config_parser.get_converter_cls()
|
| 33 |
+
converter = converter_cls(
|
| 34 |
+
config=config_parser.generate_config_dict(),
|
| 35 |
+
artifact_dict=models,
|
| 36 |
+
processor_list=config_parser.get_processors(),
|
| 37 |
+
renderer=config_parser.get_renderer(),
|
| 38 |
+
llm_service=config_parser.get_llm_service()
|
| 39 |
+
)
|
| 40 |
|
| 41 |
# Convert the PDF to markdown using marker
|
| 42 |
+
result = converter(pdf_input_path)
|
| 43 |
+
|
| 44 |
+
# The converter returns a dictionary with the markdown content
|
| 45 |
+
markdown_text = result.get('markdown', '')
|
| 46 |
|
| 47 |
# If output path is provided, save the markdown
|
| 48 |
if output_md_path:
|
|
|
|
| 58 |
|
| 59 |
except Exception as e:
|
| 60 |
print(f"An error occurred during conversion: {e}", file=sys.stderr)
|
| 61 |
+
print(f"Error details: {str(type(e))}", file=sys.stderr)
|
| 62 |
raise
|