Spaces:

sanali209
/

ocrser

Sleeping

File size: 1,487 Bytes

8e70c48
c08c2fc
 
 
 
8e70c48
 
c08c2fc
 
8e70c48
c08c2fc
2074c35
c08c2fc
 
 
 
 
 
 
 
83c5b00
 
 
c08c2fc
83c5b00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c08c2fc
 
 
8e70c48

# Base image: the slim variant of the Python 3.12 image.
FROM python:3.12-slim

# Working directory for the rest of the build and for the running container;
# relative paths in later instructions resolve against /app.
WORKDIR /app

# Install system dependencies.
# - build-essential: compiler toolchain, presumably needed by Python packages
#   that build native extensions during pip install (TODO confirm it is required)
# - libgl1 and libglib2.0-0 are often required for image processing libraries
#   (like cv2); libgl1-mesa-glx is deprecated/unavailable in newer Debian versions
# - tesseract-ocr: the Tesseract OCR engine
# --no-install-recommends installs only hard dependencies, keeping the image small.
# The apt package lists are removed in the same layer so they never persist.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    libgl1 \
    libglib2.0-0 \
    tesseract-ocr \
    && rm -rf /var/lib/apt/lists/*

# Bring in only the dependency manifest at this point, so that later changes to
# the application source do not invalidate the dependency-install layer.
COPY requirements.txt ./

# Install the Python dependencies without keeping pip's download cache in the layer.
RUN pip install --no-cache-dir --requirement requirements.txt

# Add an unprivileged account named "user" with UID 1000 and a home directory
# (the standard setup for Hugging Face Spaces).
RUN useradd --create-home --uid 1000 user

# Point HOME and every cache location at paths under /home/user so that model
# downloads land in a directory the non-root user can write to.
# These variables are intended for Docling/RapidOCR and the Hugging Face Hub client.
# The values are split across instructions because a variable set by ENV is only
# available for substitution in subsequent instructions.
ENV HOME=/home/user
ENV PATH=${HOME}/.local/bin:${PATH} \
    XDG_CACHE_HOME=${HOME}/.cache
ENV HF_HOME=${XDG_CACHE_HOME}/huggingface \
    RAPIDOCR_CACHE_DIR=${XDG_CACHE_HOME}/rapidocr

# Create the cache directories and hand ownership to the non-root user.
# /app is included because WORKDIR created it while the build was still running
# as root; without this the runtime user could not create files in its own
# working directory.
RUN mkdir -p /home/user/.cache/huggingface \
    /home/user/.cache/rapidocr \
    && chown -R user:user /home/user /app

# Switch to the non-root user before running the pre-download step,
# so that any files it writes are owned by 'user' and not 'root'.
USER user

# Instantiate Docling's DocumentConverter once at build time, with the intent of
# pulling model files into the writable cache directories configured via ENV above.
# NOTE(review): this only constructs the converter with default options. If the
# installed docling version initializes its pipelines/models lazily, nothing is
# downloaded here and the first request pays the download cost instead —
# confirm the models actually land in the image (e.g. inspect the cache size).
RUN python -c "from docling.document_converter import DocumentConverter; DocumentConverter()"

# Copy the application source into the working directory, owned by the non-root user.
# This comes after the dependency install and the model pre-download so that
# source-only changes do not invalidate those layers.
COPY --chown=user:user . .

# Document the port the server listens on (matches --port in CMD below).
EXPOSE 7860

# Start uvicorn serving the `app` object from the `main` module on all interfaces.
# Exec (JSON array) form: no shell wrapper around the server process.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]