Update Dockerfile
Browse files — Dockerfile (+21 -68)
Dockerfile
CHANGED
|
@@ -6,97 +6,50 @@ ENV PYTHONUNBUFFERED=1 \
|
|
| 6 |
MDR_DEVICE=cpu \
|
| 7 |
MDR_TABLE_FORMAT=MARKDOWN \
|
| 8 |
LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH \
|
| 9 |
-
#
|
| 10 |
-
HF_HOME=/app/.cache/huggingface \
|
| 11 |
-
# Cache/config dirs for libraries
|
| 12 |
MPLCONFIGDIR=/app/.cache/matplotlib \
|
| 13 |
YOLO_CONFIG_DIR=/app/.config/Ultralytics
|
|
|
|
| 14 |
|
| 15 |
# Set the working directory in the container
|
| 16 |
WORKDIR /app
|
| 17 |
|
| 18 |
-
# Install system dependencies
|
| 19 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 20 |
libgl1-mesa-glx \
|
| 21 |
libglib2.0-0 \
|
| 22 |
-
libsm6 \
|
| 23 |
libxext6 \
|
| 24 |
libxrender-dev \
|
| 25 |
libfreetype6-dev \
|
| 26 |
-
|
| 27 |
-
|
| 28 |
&& rm -rf /var/lib/apt/lists/*
|
| 29 |
|
| 30 |
-
# Copy requirements
|
| 31 |
COPY requirements.txt .
|
| 32 |
|
| 33 |
-
# Install Python dependencies
|
| 34 |
-
|
| 35 |
RUN pip install --no-cache-dir --upgrade pip && \
|
| 36 |
-
pip install --no-cache-dir huggingface-hub && \
|
| 37 |
pip install --no-cache-dir -r requirements.txt
|
| 38 |
|
| 39 |
-
#
|
| 40 |
-
# Create directories needed for download AND runtime
|
| 41 |
-
RUN mkdir -p ${MDR_MODEL_DIR}/onnx_ocr \
|
| 42 |
-
${MDR_MODEL_DIR}/struct_eqtable \
|
| 43 |
-
${MDR_MODEL_DIR}/latex \
|
| 44 |
-
${MDR_MODEL_DIR}/layoutreader \
|
| 45 |
-
${MDR_MODEL_DIR}/yolo_hf_cache \
|
| 46 |
-
/app/temp_uploads \
|
| 47 |
-
${HF_HOME} \
|
| 48 |
-
${MPLCONFIGDIR} \
|
| 49 |
-
${YOLO_CONFIG_DIR} && \
|
| 50 |
-
# Set permissions broadly - adjust if needed, but often required on Spaces
|
| 51 |
-
chmod -R 777 ${MDR_MODEL_DIR} /app
|
| 52 |
-
|
| 53 |
-
# Download ONNX OCR Models (using requests/curl/wget or a helper script)
|
| 54 |
-
# Option 1: Direct download (if URLs are stable)
|
| 55 |
-
RUN curl -L "https://huggingface.co/moskize/OnnxOCR/resolve/main/ppocrv4/det/det.onnx" -o "${MDR_MODEL_DIR}/onnx_ocr/det.onnx" && \
|
| 56 |
-
curl -L "https://huggingface.co/moskize/OnnxOCR/resolve/main/ppocrv4/cls/cls.onnx" -o "${MDR_MODEL_DIR}/onnx_ocr/cls.onnx" && \
|
| 57 |
-
curl -L "https://huggingface.co/moskize/OnnxOCR/resolve/main/ppocrv4/rec/rec.onnx" -o "${MDR_MODEL_DIR}/onnx_ocr/rec.onnx" && \
|
| 58 |
-
curl -L "https://huggingface.co/moskize/OnnxOCR/resolve/main/ch_ppocr_server_v2.0/ppocr_keys_v1.txt" -o "${MDR_MODEL_DIR}/onnx_ocr/ppocr_keys_v1.txt"
|
| 59 |
-
|
| 60 |
-
# Download YOLO Model (using huggingface-cli)
|
| 61 |
-
RUN huggingface-cli download \
|
| 62 |
-
juliozhao/DocLayout-YOLO-DocStructBench \
|
| 63 |
-
doclayout_yolo_docstructbench_imgsz1024.pt \
|
| 64 |
-
--local-dir ${MDR_MODEL_DIR}/yolo_hf_cache \
|
| 65 |
-
--local-dir-use-symlinks False
|
| 66 |
-
|
| 67 |
-
# Download LaTeX Models (using requests/curl/wget or helper script)
|
| 68 |
-
RUN curl -L "https://github.com/lukas-blecher/LaTeX-OCR/releases/download/v0.0.1/weights.pth" -o "${MDR_MODEL_DIR}/latex/weights.pth" && \
|
| 69 |
-
curl -L "https://github.com/lukas-blecher/LaTeX-OCR/releases/download/v0.0.1/image_resizer.pth" -o "${MDR_MODEL_DIR}/latex/image_resizer.pth"
|
| 70 |
-
# Note: config.yaml for LaTeX might need to be created or copied if required by the library
|
| 71 |
-
|
| 72 |
-
# Download LayoutReader Model (using huggingface-cli)
|
| 73 |
-
# Assuming LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", ...)
|
| 74 |
-
# will handle caching correctly if HF_HOME is set and writable.
|
| 75 |
-
# If it uses a different mechanism or specific files are needed, download them explicitly.
|
| 76 |
-
# For LayoutReader using Hantian/layoutreader:
|
| 77 |
-
RUN huggingface-cli download \
|
| 78 |
-
Hantian/layoutreader \
|
| 79 |
-
--local-dir ${MDR_MODEL_DIR}/layoutreader \
|
| 80 |
-
--local-dir-use-symlinks False
|
| 81 |
-
|
| 82 |
-
# Download StructTable Model (using huggingface-cli)
|
| 83 |
-
# Assuming build_model(model_ckpt="U4R/StructTable-InternVL2-1B", ...)
|
| 84 |
-
RUN huggingface-cli download \
|
| 85 |
-
U4R/StructTable-InternVL2-1B \
|
| 86 |
-
--local-dir ${MDR_MODEL_DIR}/struct_eqtable \
|
| 87 |
-
--local-dir-use-symlinks False
|
| 88 |
-
|
| 89 |
-
# --- END PRE-DOWNLOAD MODELS ---
|
| 90 |
-
|
| 91 |
-
# Copy the application code AFTER downloads
|
| 92 |
COPY mdr_pdf_parser.py .
|
| 93 |
COPY main.py .
|
| 94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
# Expose the port the app runs on
|
| 96 |
EXPOSE 8000
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
| 100 |
|
| 101 |
-
#
|
| 102 |
-
|
|
|
|
| 6 |
MDR_DEVICE=cpu \
|
| 7 |
MDR_TABLE_FORMAT=MARKDOWN \
|
| 8 |
LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH \
|
| 9 |
+
# --- ADDED: Point config/cache dirs to writable locations within /app ---
|
|
|
|
|
|
|
| 10 |
MPLCONFIGDIR=/app/.cache/matplotlib \
|
| 11 |
YOLO_CONFIG_DIR=/app/.config/Ultralytics
|
| 12 |
+
# --- END ADDED ---
|
| 13 |
|
| 14 |
# Set the working directory in the container
|
| 15 |
WORKDIR /app
|
| 16 |
|
| 17 |
+
# Install system dependencies required by OpenCV and potentially others
|
| 18 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 19 |
libgl1-mesa-glx \
|
| 20 |
libglib2.0-0 \
|
|
|
|
| 21 |
libxext6 \
|
| 22 |
libxrender-dev \
|
| 23 |
libfreetype6-dev \
|
| 24 |
+
|
| 25 |
+
|
| 26 |
&& rm -rf /var/lib/apt/lists/*
|
| 27 |
|
| 28 |
+
# Copy the requirements file into the container
|
| 29 |
COPY requirements.txt .
|
| 30 |
|
| 31 |
+
# Install Python dependencies
|
| 32 |
+
|
| 33 |
RUN pip install --no-cache-dir --upgrade pip && \
|
|
|
|
| 34 |
pip install --no-cache-dir -r requirements.txt
|
| 35 |
|
| 36 |
+
# Copy the application code into the container
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
COPY mdr_pdf_parser.py .
|
| 38 |
COPY main.py .
|
| 39 |
|
| 40 |
+
# Create the default model directory, temp directory, AND the config/cache dirs
|
| 41 |
+
# --- MODIFIED: Added creation of the new cache/config dirs ---
|
| 42 |
+
RUN mkdir -p ${MDR_MODEL_DIR} /app/temp_uploads /app/.cache/matplotlib /app/.config/Ultralytics && \
|
| 43 |
+
chmod -R 777 ${MDR_MODEL_DIR} /app/temp_uploads /app/.cache /app/.config
|
| 44 |
+
# Note: chmod 777 is very permissive, but it is often the simplest workable option in restricted environments like HF Spaces.
|
| 45 |
+
# It ensures the directories are writable by the user running the application.
|
| 46 |
+
# --- END MODIFIED ---
|
| 47 |
+
|
| 48 |
# Expose the port the app runs on
|
| 49 |
EXPOSE 8000
|
| 50 |
|
| 51 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
| 52 |
+
|
| 53 |
|
| 54 |
+
# Declare the model directory as a volume (mount point) so downloaded models can persist, e.g. by mounting a host directory or named volume at run time
|
| 55 |
+
VOLUME ${MDR_MODEL_DIR}
|