Spaces:
Sleeping
Sleeping
Commit ·
41ee299
1
Parent(s): a49c5dc
Update PDF to Markdown converter API with NVIDIA L4 support
Browse files
- Dockerfile +6 -0
- pdf_converter/convert_pdf_to_md.py +15 -0
Dockerfile
CHANGED
|
@@ -59,12 +59,18 @@ COPY --chown=user:user . .
|
|
| 59 |
RUN mkdir -p /app/docker_mineru/output/images && \
|
| 60 |
chown -R user:user /app/docker_mineru/output
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
# Set the user
|
| 63 |
USER user
|
| 64 |
|
| 65 |
# Environment variables for caching (optional, might help with model downloads)
|
| 66 |
ENV HF_HOME=/home/user/.cache/huggingface
|
| 67 |
ENV TORCH_HOME=/home/user/.cache/torch
|
|
|
|
|
|
|
| 68 |
|
| 69 |
# Expose the port
|
| 70 |
EXPOSE 7860
|
|
|
|
| 59 |
RUN mkdir -p /app/docker_mineru/output/images && \
|
| 60 |
chown -R user:user /app/docker_mineru/output
|
| 61 |
|
| 62 |
+
# Create marker static directory and set proper permissions (fix for font download error)
|
| 63 |
+
RUN mkdir -p /usr/local/lib/python3.10/dist-packages/static && \
|
| 64 |
+
chmod -R 777 /usr/local/lib/python3.10/dist-packages/static
|
| 65 |
+
|
| 66 |
# Set the user
|
| 67 |
USER user
|
| 68 |
|
| 69 |
# Environment variables for caching (optional, might help with model downloads)
|
| 70 |
ENV HF_HOME=/home/user/.cache/huggingface
|
| 71 |
ENV TORCH_HOME=/home/user/.cache/torch
|
| 72 |
+
# Add environment variable for marker font path (alternative fix)
|
| 73 |
+
ENV MARKER_FONT_PATH=/home/user/.cache/marker_fonts
|
| 74 |
|
| 75 |
# Expose the port
|
| 76 |
EXPOSE 7860
|
pdf_converter/convert_pdf_to_md.py
CHANGED
|
@@ -13,6 +13,21 @@ def initialize_converter():
|
|
| 13 |
if _converter is None:
|
| 14 |
print("Initializing marker models...")
|
| 15 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
# Create configuration, explicitly setting output format
|
| 17 |
# Potential optimization: Check if batch_multiplier or similar exists
|
| 18 |
config_parser = ConfigParser({'output_format': 'markdown'}) # Add batch_multiplier here if applicable
|
|
|
|
| 13 |
if _converter is None:
|
| 14 |
print("Initializing marker models...")
|
| 15 |
try:
|
| 16 |
+
# Set custom font path from environment variable if available
|
| 17 |
+
font_path = os.environ.get('MARKER_FONT_PATH')
|
| 18 |
+
if font_path:
|
| 19 |
+
try:
|
| 20 |
+
# Import marker settings and override font path
|
| 21 |
+
from marker import settings
|
| 22 |
+
os.makedirs(font_path, exist_ok=True)
|
| 23 |
+
custom_font_path = os.path.join(font_path, 'NotoSans-Regular.ttf')
|
| 24 |
+
settings.FONT_PATH = custom_font_path
|
| 25 |
+
print(f"Using custom font path: {custom_font_path}")
|
| 26 |
+
except ImportError:
|
| 27 |
+
print("Could not import marker settings, using default font path")
|
| 28 |
+
except Exception as e:
|
| 29 |
+
print(f"Error setting custom font path: {e}", file=sys.stderr)
|
| 30 |
+
|
| 31 |
# Create configuration, explicitly setting output format
|
| 32 |
# Potential optimization: Check if batch_multiplier or similar exists
|
| 33 |
config_parser = ConfigParser({'output_format': 'markdown'}) # Add batch_multiplier here if applicable
|