Spaces:
Runtime error
Runtime error
Commit
·
4e3d16d
0
Parent(s):
Initial deployment of MinerU PDF API
Browse files- .dockerignore +41 -0
- .gitattributes +35 -0
- Dockerfile +87 -0
- README.md +138 -0
- app.py +705 -0
- entrypoint.sh +202 -0
- requirements.txt +10 -0
- space_config.json +4 -0
.dockerignore
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Git files
|
| 2 |
+
.git
|
| 3 |
+
.gitignore
|
| 4 |
+
|
| 5 |
+
# Python cache files
|
| 6 |
+
__pycache__/
|
| 7 |
+
*.py[cod]
|
| 8 |
+
*$py.class
|
| 9 |
+
*.so
|
| 10 |
+
.Python
|
| 11 |
+
env/
|
| 12 |
+
build/
|
| 13 |
+
develop-eggs/
|
| 14 |
+
dist/
|
| 15 |
+
downloads/
|
| 16 |
+
eggs/
|
| 17 |
+
.eggs/
|
| 18 |
+
lib/
|
| 19 |
+
lib64/
|
| 20 |
+
parts/
|
| 21 |
+
sdist/
|
| 22 |
+
var/
|
| 23 |
+
wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
|
| 28 |
+
# Editor directories and files
|
| 29 |
+
.idea
|
| 30 |
+
.vscode
|
| 31 |
+
*.swp
|
| 32 |
+
*.swo
|
| 33 |
+
|
| 34 |
+
# OS files
|
| 35 |
+
.DS_Store
|
| 36 |
+
.DS_Store?
|
| 37 |
+
._*
|
| 38 |
+
.Spotlight-V100
|
| 39 |
+
.Trashes
|
| 40 |
+
ehthumbs.db
|
| 41 |
+
Thumbs.db
|
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM nvidia/cuda:12.1.0-base-ubuntu22.04
|
| 2 |
+
|
| 3 |
+
# Set environment variables
|
| 4 |
+
ENV DEBIAN_FRONTEND=noninteractive
|
| 5 |
+
ENV PYTHONUNBUFFERED=1
|
| 6 |
+
|
| 7 |
+
# Install system dependencies
|
| 8 |
+
RUN apt-get update && apt-get install -y \
|
| 9 |
+
python3 \
|
| 10 |
+
python3-pip \
|
| 11 |
+
python3-venv \
|
| 12 |
+
python3-dev \
|
| 13 |
+
wget \
|
| 14 |
+
git \
|
| 15 |
+
build-essential \
|
| 16 |
+
libgl1-mesa-glx \
|
| 17 |
+
libglib2.0-0 \
|
| 18 |
+
imagemagick \
|
| 19 |
+
ghostscript \
|
| 20 |
+
poppler-utils \
|
| 21 |
+
libmagickwand-dev \
|
| 22 |
+
fonts-freefont-ttf \
|
| 23 |
+
ffmpeg \
|
| 24 |
+
libsm6 \
|
| 25 |
+
libxext6 \
|
| 26 |
+
libxrender-dev \
|
| 27 |
+
pkg-config \
|
| 28 |
+
libcairo2-dev \
|
| 29 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 30 |
+
|
| 31 |
+
# Configure ImageMagick policy to allow PDF conversion (needed for sample PDF creation)
|
| 32 |
+
RUN if [ -f "/etc/ImageMagick-6/policy.xml" ]; then \
|
| 33 |
+
sed -i 's/rights="none" pattern="PDF"/rights="read|write" pattern="PDF"/g' /etc/ImageMagick-6/policy.xml; \
|
| 34 |
+
fi
|
| 35 |
+
|
| 36 |
+
# Create a virtual environment
|
| 37 |
+
RUN python3 -m venv /opt/mineru_venv
|
| 38 |
+
ENV PATH="/opt/mineru_venv/bin:$PATH"
|
| 39 |
+
|
| 40 |
+
# Upgrade pip in the virtual environment
|
| 41 |
+
RUN pip install --upgrade pip
|
| 42 |
+
|
| 43 |
+
# Clone the MinerU repository
|
| 44 |
+
RUN git clone https://github.com/opendatalab/MinerU.git /tmp/MinerU
|
| 45 |
+
|
| 46 |
+
# Install required packages
|
| 47 |
+
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
|
| 48 |
+
|
| 49 |
+
# Install MinerU with all features
|
| 50 |
+
WORKDIR /tmp/MinerU
|
| 51 |
+
RUN pip install --no-cache-dir -e ".[full]"
|
| 52 |
+
|
| 53 |
+
# Install additional dependencies for the web application
|
| 54 |
+
RUN pip install --no-cache-dir flask==2.3.3 flask-cors==4.0.0 werkzeug==2.3.7
|
| 55 |
+
|
| 56 |
+
# Create directories for uploads and output
|
| 57 |
+
RUN mkdir -p /tmp/pdf_uploads /tmp/pdf_output
|
| 58 |
+
RUN mkdir -p /tmp/samples
|
| 59 |
+
|
| 60 |
+
# Create a non-root user for Hugging Face Spaces
|
| 61 |
+
# This is critical for permissions on HF Spaces
|
| 62 |
+
RUN useradd -m -u 1000 user
|
| 63 |
+
RUN mkdir -p /app/samples && chown -R user:user /app
|
| 64 |
+
|
| 65 |
+
# Download model weights
|
| 66 |
+
RUN echo "Downloading MinerU model weights..."
|
| 67 |
+
# NOTE: nothing is downloaded at build time; the echo above is only a log line. Model weights are downloaded automatically during the first run.
|
| 68 |
+
|
| 69 |
+
# Copy the application files
|
| 70 |
+
WORKDIR /app
|
| 71 |
+
COPY . /app/
|
| 72 |
+
|
| 73 |
+
# Fix permissions for the user
|
| 74 |
+
RUN chown -R user:user /app
|
| 75 |
+
RUN mkdir -p /home/user/.config/magic_pdf && chown -R user:user /home/user/.config
|
| 76 |
+
|
| 77 |
+
# Expose the port
|
| 78 |
+
EXPOSE 7860
|
| 79 |
+
|
| 80 |
+
# Set up entrypoint
|
| 81 |
+
RUN chmod +x /app/entrypoint.sh
|
| 82 |
+
|
| 83 |
+
# Switch to non-root user for running the app
|
| 84 |
+
USER user
|
| 85 |
+
|
| 86 |
+
# Start the application
|
| 87 |
+
CMD ["/app/entrypoint.sh"]
|
README.md
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: MinerU PDF Converter
|
| 3 |
+
emoji: 📄
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
# MinerU PDF Converter
|
| 11 |
+
|
| 12 |
+
This Space provides a service for converting PDF files to Markdown and JSON formats using the MinerU PDF extraction tool.
|
| 13 |
+
|
| 14 |
+
## Features
|
| 15 |
+
|
| 16 |
+
- Web interface for uploading and converting PDF files
|
| 17 |
+
- RESTful API for programmatic access
|
| 18 |
+
- Health monitoring endpoint
|
| 19 |
+
- High-quality PDF extraction with support for tables, formulas, and complex layouts
|
| 20 |
+
- Output in both Markdown and structured JSON formats
|
| 21 |
+
- Comprehensive error handling and fallback mechanisms
|
| 22 |
+
|
| 23 |
+
## API Usage
|
| 24 |
+
|
| 25 |
+
The service exposes several API endpoints for programmatic access:
|
| 26 |
+
|
| 27 |
+
### 1. PDF Conversion Endpoint
|
| 28 |
+
|
| 29 |
+
```
|
| 30 |
+
POST /api/convert
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
**Request:**
|
| 34 |
+
- Content-Type: multipart/form-data
|
| 35 |
+
- Body: form field 'file' containing the PDF file
|
| 36 |
+
|
| 37 |
+
**Response:**
|
| 38 |
+
```json
|
| 39 |
+
{
|
| 40 |
+
"success": true,
|
| 41 |
+
"message": "PDF conversion successful",
|
| 42 |
+
"job_id": "uuid",
|
| 43 |
+
"base_filename": "filename",
|
| 44 |
+
"file_info": {
|
| 45 |
+
"original_filename": "document.pdf",
|
| 46 |
+
"size_bytes": 42950,
|
| 47 |
+
"content_type": "application/pdf"
|
| 48 |
+
},
|
| 49 |
+
"markdown": "# Converted markdown content...",
|
| 50 |
+
"json": {
|
| 51 |
+
"title": "Document Title",
|
| 52 |
+
"sections": [...]
|
| 53 |
+
},
|
| 54 |
+
"log": "Processing log...",
|
| 55 |
+
"files": {
|
| 56 |
+
"markdown_path": "document.md",
|
| 57 |
+
"json_path": "document.json"
|
| 58 |
+
}
|
| 59 |
+
}
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
### 2. Health Check Endpoint
|
| 63 |
+
|
| 64 |
+
```
|
| 65 |
+
GET /health
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
**Response:**
|
| 69 |
+
```json
|
| 70 |
+
{
|
| 71 |
+
"status": "healthy",
|
| 72 |
+
"version": "1.1.0",
|
| 73 |
+
"environment": {
|
| 74 |
+
"python_version": "3.10.12",
|
| 75 |
+
"platform": "Linux-6.1.58+-x86_64-with-glibc2.35",
|
| 76 |
+
"processor": "x86_64"
|
| 77 |
+
},
|
| 78 |
+
"configuration": {
|
| 79 |
+
"upload_folder_exists": true,
|
| 80 |
+
"output_folder_exists": true,
|
| 81 |
+
"magic_pdf_installed": true
|
| 82 |
+
}
|
| 83 |
+
}
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
### Client Example
|
| 87 |
+
|
| 88 |
+
A Python client script (`api_client.py`) is included in this repository for easy integration:
|
| 89 |
+
|
| 90 |
+
```bash
|
| 91 |
+
# Example usage
|
| 92 |
+
python api_client.py path/to/your/document.pdf --api-url https://marcosremar2-mineru.hf.space
|
| 93 |
+
```
|
| 94 |
+
|
| 95 |
+
The client includes features such as:
|
| 96 |
+
- Automatic health check to verify API status
|
| 97 |
+
- Retry logic for failed requests
|
| 98 |
+
- Progress tracking
|
| 99 |
+
- Comprehensive error handling
|
| 100 |
+
|
| 101 |
+
You can also use curl:
|
| 102 |
+
|
| 103 |
+
```bash
|
| 104 |
+
curl -X POST -F "file=@path/to/your/document.pdf" https://marcosremar2-mineru.hf.space/api/convert
|
| 105 |
+
```
|
| 106 |
+
|
| 107 |
+
And check health with:
|
| 108 |
+
|
| 109 |
+
```bash
|
| 110 |
+
curl https://marcosremar2-mineru.hf.space/health
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
## Web Interface
|
| 114 |
+
|
| 115 |
+
The Space also provides a web interface where you can:
|
| 116 |
+
- Upload PDF files for conversion
|
| 117 |
+
- View the generated Markdown and JSON
|
| 118 |
+
- Download the converted files
|
| 119 |
+
- View processing logs
|
| 120 |
+
|
| 121 |
+
## Implementation Details
|
| 122 |
+
|
| 123 |
+
This service uses:
|
| 124 |
+
- MinerU for high-quality PDF extraction
|
| 125 |
+
- PyMuPDF as a fallback conversion method
|
| 126 |
+
- Flask web server for the interface and API
|
| 127 |
+
- Docker container for deployment on Hugging Face Spaces
|
| 128 |
+
|
| 129 |
+
## Error Handling
|
| 130 |
+
|
| 131 |
+
The service includes robust error handling:
|
| 132 |
+
- Automatic fallback to local PDF conversion if MinerU is unavailable
|
| 133 |
+
- Detailed error messages and logs
|
| 134 |
+
- API responses include comprehensive details for debugging
|
| 135 |
+
|
| 136 |
+
## Learn More
|
| 137 |
+
|
| 138 |
+
For more information about MinerU, visit [the MinerU repository](https://github.com/opendatalab/MinerU).
|
app.py
ADDED
|
@@ -0,0 +1,705 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from flask import Flask, request, jsonify, render_template_string, redirect, url_for, send_from_directory
|
| 2 |
+
import os
|
| 3 |
+
import subprocess
|
| 4 |
+
import tempfile
|
| 5 |
+
import uuid
|
| 6 |
+
import json
|
| 7 |
+
import shutil
|
| 8 |
+
import time
|
| 9 |
+
import platform
|
| 10 |
+
import sys
|
| 11 |
+
from werkzeug.utils import secure_filename
|
| 12 |
+
from flask_cors import CORS # Add CORS support
|
| 13 |
+
|
| 14 |
+
app = Flask(__name__)
|
| 15 |
+
CORS(app) # Enable CORS for all routes
|
| 16 |
+
|
| 17 |
+
# Use user home directory for better permission handling
|
| 18 |
+
USER_HOME = os.path.expanduser("~")
|
| 19 |
+
UPLOAD_FOLDER = os.path.join(USER_HOME, 'pdf_uploads')
|
| 20 |
+
OUTPUT_FOLDER = os.path.join(USER_HOME, 'pdf_output')
|
| 21 |
+
|
| 22 |
+
# Create upload and output directories
|
| 23 |
+
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
| 24 |
+
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
|
| 25 |
+
|
| 26 |
+
# Version information
|
| 27 |
+
APP_VERSION = "1.1.0"
|
| 28 |
+
|
| 29 |
+
HTML_TEMPLATE = """
|
| 30 |
+
<!DOCTYPE html>
|
| 31 |
+
<html>
|
| 32 |
+
<head>
|
| 33 |
+
<title>MinerU PDF Processing</title>
|
| 34 |
+
<style>
|
| 35 |
+
body {
|
| 36 |
+
font-family: Arial, sans-serif;
|
| 37 |
+
max-width: 900px;
|
| 38 |
+
margin: 0 auto;
|
| 39 |
+
padding: 20px;
|
| 40 |
+
line-height: 1.6;
|
| 41 |
+
}
|
| 42 |
+
.container {
|
| 43 |
+
background-color: #f9f9f9;
|
| 44 |
+
padding: 20px;
|
| 45 |
+
border-radius: 8px;
|
| 46 |
+
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
| 47 |
+
margin-bottom: 20px;
|
| 48 |
+
}
|
| 49 |
+
h1 {
|
| 50 |
+
color: #2c3e50;
|
| 51 |
+
}
|
| 52 |
+
pre {
|
| 53 |
+
background-color: #f1f1f1;
|
| 54 |
+
padding: 10px;
|
| 55 |
+
border-radius: 4px;
|
| 56 |
+
overflow-x: auto;
|
| 57 |
+
max-height: 300px;
|
| 58 |
+
overflow-y: auto;
|
| 59 |
+
}
|
| 60 |
+
.command {
|
| 61 |
+
font-family: monospace;
|
| 62 |
+
background-color: #eee;
|
| 63 |
+
padding: 5px;
|
| 64 |
+
border-radius: 3px;
|
| 65 |
+
}
|
| 66 |
+
.upload-form {
|
| 67 |
+
margin: 20px 0;
|
| 68 |
+
padding: 15px;
|
| 69 |
+
border: 1px solid #ddd;
|
| 70 |
+
border-radius: 8px;
|
| 71 |
+
}
|
| 72 |
+
.btn {
|
| 73 |
+
background-color: #4CAF50;
|
| 74 |
+
color: white;
|
| 75 |
+
padding: 8px 16px;
|
| 76 |
+
border: none;
|
| 77 |
+
border-radius: 4px;
|
| 78 |
+
cursor: pointer;
|
| 79 |
+
font-size: 16px;
|
| 80 |
+
}
|
| 81 |
+
.btn:hover {
|
| 82 |
+
background-color: #45a049;
|
| 83 |
+
}
|
| 84 |
+
.loading {
|
| 85 |
+
display: none;
|
| 86 |
+
color: #666;
|
| 87 |
+
margin-top: 10px;
|
| 88 |
+
}
|
| 89 |
+
.result-section {
|
| 90 |
+
margin-top: 20px;
|
| 91 |
+
}
|
| 92 |
+
.tab {
|
| 93 |
+
overflow: hidden;
|
| 94 |
+
border: 1px solid #ccc;
|
| 95 |
+
background-color: #f1f1f1;
|
| 96 |
+
margin-top: 20px;
|
| 97 |
+
}
|
| 98 |
+
.tab button {
|
| 99 |
+
background-color: inherit;
|
| 100 |
+
float: left;
|
| 101 |
+
border: none;
|
| 102 |
+
outline: none;
|
| 103 |
+
cursor: pointer;
|
| 104 |
+
padding: 10px 16px;
|
| 105 |
+
transition: 0.3s;
|
| 106 |
+
}
|
| 107 |
+
.tab button:hover {
|
| 108 |
+
background-color: #ddd;
|
| 109 |
+
}
|
| 110 |
+
.tab button.active {
|
| 111 |
+
background-color: #ccc;
|
| 112 |
+
}
|
| 113 |
+
.tabcontent {
|
| 114 |
+
display: none;
|
| 115 |
+
padding: 6px 12px;
|
| 116 |
+
border: 1px solid #ccc;
|
| 117 |
+
border-top: none;
|
| 118 |
+
max-height: 500px;
|
| 119 |
+
overflow-y: auto;
|
| 120 |
+
white-space: pre-wrap;
|
| 121 |
+
}
|
| 122 |
+
</style>
|
| 123 |
+
</head>
|
| 124 |
+
<body>
|
| 125 |
+
<div class="container">
|
| 126 |
+
<h1>MinerU PDF Processing Service</h1>
|
| 127 |
+
<p>This service uses MinerU to convert PDF documents to Markdown and JSON formats.</p>
|
| 128 |
+
|
| 129 |
+
<h2>GPU Status</h2>
|
| 130 |
+
<pre id="gpuStatus">Loading...</pre>
|
| 131 |
+
|
| 132 |
+
<div class="upload-form">
|
| 133 |
+
<h2>Convert PDF File</h2>
|
| 134 |
+
<form action="/convert" method="post" enctype="multipart/form-data" id="uploadForm">
|
| 135 |
+
<input type="file" name="file" accept=".pdf" required>
|
| 136 |
+
<button type="submit" class="btn">Convert PDF</button>
|
| 137 |
+
</form>
|
| 138 |
+
<div id="loadingIndicator" class="loading">Processing PDF file... This may take a minute.</div>
|
| 139 |
+
</div>
|
| 140 |
+
|
| 141 |
+
<div class="result-section" id="resultSection" style="display: none;">
|
| 142 |
+
<h2>Conversion Results</h2>
|
| 143 |
+
<div id="resultInfo"></div>
|
| 144 |
+
|
| 145 |
+
<div class="tab">
|
| 146 |
+
<button class="tablinks" onclick="openTab(event, 'Markdown')" id="defaultOpen">Markdown</button>
|
| 147 |
+
<button class="tablinks" onclick="openTab(event, 'JSON')">JSON</button>
|
| 148 |
+
<button class="tablinks" onclick="openTab(event, 'Log')">Processing Log</button>
|
| 149 |
+
</div>
|
| 150 |
+
|
| 151 |
+
<div id="Markdown" class="tabcontent">
|
| 152 |
+
<pre id="markdownContent"></pre>
|
| 153 |
+
<a id="downloadMarkdown" class="btn" style="margin-top: 10px;">Download Markdown</a>
|
| 154 |
+
</div>
|
| 155 |
+
|
| 156 |
+
<div id="JSON" class="tabcontent">
|
| 157 |
+
<pre id="jsonContent"></pre>
|
| 158 |
+
<a id="downloadJson" class="btn" style="margin-top: 10px;">Download JSON</a>
|
| 159 |
+
</div>
|
| 160 |
+
|
| 161 |
+
<div id="Log" class="tabcontent">
|
| 162 |
+
<pre id="logContent"></pre>
|
| 163 |
+
</div>
|
| 164 |
+
</div>
|
| 165 |
+
|
| 166 |
+
<h2>Available Commands</h2>
|
| 167 |
+
<p>MinerU provides the following commands:</p>
|
| 168 |
+
<p><span class="command">magic-pdf</span> - Process PDF documents</p>
|
| 169 |
+
|
| 170 |
+
<h2>Help Output</h2>
|
| 171 |
+
<pre id="helpOutput">Loading...</pre>
|
| 172 |
+
</div>
|
| 173 |
+
|
| 174 |
+
<script>
|
| 175 |
+
// Fetch GPU status
|
| 176 |
+
fetch('/gpu-status')
|
| 177 |
+
.then(response => response.json())
|
| 178 |
+
.then(data => {
|
| 179 |
+
document.getElementById('gpuStatus').textContent = data.output;
|
| 180 |
+
})
|
| 181 |
+
.catch(error => {
|
| 182 |
+
document.getElementById('gpuStatus').textContent = 'Error fetching GPU status: ' + error.message;
|
| 183 |
+
});
|
| 184 |
+
|
| 185 |
+
// Fetch help output
|
| 186 |
+
fetch('/help-output')
|
| 187 |
+
.then(response => response.json())
|
| 188 |
+
.then(data => {
|
| 189 |
+
document.getElementById('helpOutput').textContent = data.output;
|
| 190 |
+
})
|
| 191 |
+
.catch(error => {
|
| 192 |
+
document.getElementById('helpOutput').textContent = 'Error fetching help: ' + error.message;
|
| 193 |
+
});
|
| 194 |
+
|
| 195 |
+
// Tab functionality
|
| 196 |
+
function openTab(evt, tabName) {
|
| 197 |
+
var i, tabcontent, tablinks;
|
| 198 |
+
tabcontent = document.getElementsByClassName("tabcontent");
|
| 199 |
+
for (i = 0; i < tabcontent.length; i++) {
|
| 200 |
+
tabcontent[i].style.display = "none";
|
| 201 |
+
}
|
| 202 |
+
tablinks = document.getElementsByClassName("tablinks");
|
| 203 |
+
for (i = 0; i < tablinks.length; i++) {
|
| 204 |
+
tablinks[i].className = tablinks[i].className.replace(" active", "");
|
| 205 |
+
}
|
| 206 |
+
document.getElementById(tabName).style.display = "block";
|
| 207 |
+
evt.currentTarget.className += " active";
|
| 208 |
+
}
|
| 209 |
+
|
| 210 |
+
// Set up form submission
|
| 211 |
+
document.getElementById('uploadForm').addEventListener('submit', function(e) {
|
| 212 |
+
e.preventDefault();
|
| 213 |
+
|
| 214 |
+
const loadingIndicator = document.getElementById('loadingIndicator');
|
| 215 |
+
loadingIndicator.style.display = 'block';
|
| 216 |
+
|
| 217 |
+
const resultSection = document.getElementById('resultSection');
|
| 218 |
+
resultSection.style.display = 'none';
|
| 219 |
+
|
| 220 |
+
const formData = new FormData(this);
|
| 221 |
+
|
| 222 |
+
fetch('/convert', {
|
| 223 |
+
method: 'POST',
|
| 224 |
+
body: formData
|
| 225 |
+
})
|
| 226 |
+
.then(response => response.json())
|
| 227 |
+
.then(data => {
|
| 228 |
+
loadingIndicator.style.display = 'none';
|
| 229 |
+
resultSection.style.display = 'block';
|
| 230 |
+
|
| 231 |
+
document.getElementById('resultInfo').textContent = data.message;
|
| 232 |
+
|
| 233 |
+
// Handle Markdown content
|
| 234 |
+
if (data.markdown) {
|
| 235 |
+
document.getElementById('markdownContent').textContent = data.markdown;
|
| 236 |
+
const downloadMarkdown = document.getElementById('downloadMarkdown');
|
| 237 |
+
downloadMarkdown.href = data.markdown_url;
|
| 238 |
+
downloadMarkdown.download = data.base_filename + '.md';
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
+
// Handle JSON content
|
| 242 |
+
if (data.json) {
|
| 243 |
+
document.getElementById('jsonContent').textContent = JSON.stringify(data.json, null, 2);
|
| 244 |
+
const downloadJson = document.getElementById('downloadJson');
|
| 245 |
+
downloadJson.href = data.json_url;
|
| 246 |
+
downloadJson.download = data.base_filename + '.json';
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
// Handle log content
|
| 250 |
+
if (data.log) {
|
| 251 |
+
document.getElementById('logContent').textContent = data.log;
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
// Open the markdown tab by default
|
| 255 |
+
document.getElementById('defaultOpen').click();
|
| 256 |
+
})
|
| 257 |
+
.catch(error => {
|
| 258 |
+
loadingIndicator.style.display = 'none';
|
| 259 |
+
alert('Error: ' + error.message);
|
| 260 |
+
});
|
| 261 |
+
});
|
| 262 |
+
</script>
|
| 263 |
+
</body>
|
| 264 |
+
</html>
|
| 265 |
+
"""
|
| 266 |
+
|
| 267 |
+
@app.route('/')
def index():
    """Serve the single-page web UI rendered from ``HTML_TEMPLATE``."""
    page = render_template_string(HTML_TEMPLATE)
    return page
|
| 270 |
+
|
| 271 |
+
@app.route('/gpu-status')
def gpu_status():
    """Return the text printed by ``nvidia-smi`` as ``{"output": <text>}``.

    The endpoint always answers with JSON: if ``nvidia-smi`` exits with a
    non-zero status or is not installed, ``output`` carries an explanatory
    message instead of the tool's report.
    """
    try:
        raw = subprocess.check_output(['nvidia-smi'], stderr=subprocess.STDOUT)
        # errors='replace' keeps a stray non-UTF-8 byte in the tool output
        # from turning this status probe into an unhandled exception.
        output = raw.decode('utf-8', errors='replace')
    except subprocess.CalledProcessError as e:
        output = f"Error running nvidia-smi: {e.output.decode('utf-8', errors='replace')}"
    except FileNotFoundError:
        output = "nvidia-smi command not found. GPU may not be available."
    return jsonify({"output": output})
|
| 281 |
+
|
| 282 |
+
@app.route('/help-output')
def help_output():
    """Return the text printed by ``magic-pdf --help`` as ``{"output": <text>}``.

    The endpoint always answers with JSON: if the command exits with a
    non-zero status or is not installed, ``output`` carries an explanatory
    message instead of the help text.
    """
    try:
        raw = subprocess.check_output(['magic-pdf', '--help'], stderr=subprocess.STDOUT)
        # errors='replace' keeps a stray non-UTF-8 byte in the tool output
        # from turning this informational endpoint into an unhandled exception.
        output = raw.decode('utf-8', errors='replace')
    except subprocess.CalledProcessError as e:
        output = f"Error running magic-pdf --help: {e.output.decode('utf-8', errors='replace')}"
    except FileNotFoundError:
        output = "magic-pdf command not found. MinerU may not be installed correctly."
    return jsonify({"output": output})
|
| 292 |
+
|
| 293 |
+
def _find_output_file(root, candidate_names):
    """Return the path of the first candidate file found under ``root``, or None.

    The directory ``root`` itself is checked first (the location this service
    has always looked in); only if nothing is found there is the tree below
    ``root`` walked.
    """
    for name in candidate_names:
        direct = os.path.join(root, name)
        if os.path.isfile(direct):
            return direct
    for name in candidate_names:
        for dirpath, _dirnames, filenames in os.walk(root):
            if name in filenames:
                return os.path.join(dirpath, name)
    return None


@app.route('/convert', methods=['POST'])
def convert_pdf():
    """Convert an uploaded PDF with ``magic-pdf`` and return the results as JSON.

    Expects a multipart form upload with a ``file`` field holding a ``.pdf``.

    Returns:
        200 with ``message``, ``markdown``, ``json``, ``log``,
        ``base_filename``, ``job_id``, ``markdown_url`` and ``json_url`` on
        success; 400 with ``error`` when the upload is missing or is not a
        PDF; 500 with ``error`` plus ``log`` (converter exited non-zero) or
        ``details`` (unexpected exception) otherwise.
    """
    if 'file' not in request.files:
        return jsonify({"error": "No file part"}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({"error": "No selected file"}), 400

    if not file.filename.lower().endswith('.pdf'):
        return jsonify({"error": "File must be a PDF"}), 400

    # Generate a unique ID for this conversion
    job_id = str(uuid.uuid4())
    job_dir = os.path.join(OUTPUT_FOLDER, job_id)
    os.makedirs(job_dir, exist_ok=True)

    # Save the uploaded file. secure_filename() drops non-ASCII characters and
    # leading dots, so a name such as "文档.pdf" can come back without its
    # extension (or empty); fall back to a fixed name so the converter still
    # receives a real "*.pdf" path.
    filename = secure_filename(file.filename)
    if not filename.lower().endswith('.pdf'):
        filename = 'document.pdf'
    base_filename = os.path.splitext(filename)[0]
    pdf_path = os.path.join(job_dir, filename)
    file.save(pdf_path)

    # Run magic-pdf on the file
    output_dir = os.path.join(job_dir, "output")
    os.makedirs(output_dir, exist_ok=True)

    log_file = os.path.join(job_dir, "conversion.log")

    try:
        # Define the default config dictionary first
        default_config = {
            "device-mode": "cpu",
            "layout-config": {
                "model": "doclayout_yolo",
                "enable": True
            },
            "formula-config": {
                "mfd_model": "yolo_v8_mfd",
                "mfr_model": "unimernet_small",
                "enable": True
            },
            "table-config": {
                "model": "rapid_table",
                "sub_model": "slanet_plus",
                "enable": True,
                "max_time": 400
            }
        }

        # Create the magic-pdf.json configuration file in .config if it doesn't exist
        config_dir = os.path.expanduser("~/.config/magic_pdf")
        os.makedirs(config_dir, exist_ok=True)
        config_file = os.path.join(config_dir, "magic-pdf.json")

        if not os.path.exists(config_file):
            with open(config_file, 'w') as f:
                json.dump(default_config, f, indent=2)

        # Also create the config in the home directory as fallback
        home_config_file = os.path.join(os.path.expanduser("~"), "magic-pdf.json")
        if not os.path.exists(home_config_file):
            with open(home_config_file, 'w') as f:
                json.dump(default_config, f, indent=2)

        # No delay is needed here: both config files are closed (and therefore
        # flushed) by their `with` blocks before the subprocess is started.

        # Use magic-pdf to convert the PDF to Markdown and JSON
        cmd = [
            'magic-pdf',
            '--path', pdf_path,
            '--output-dir', output_dir
        ]

        # Run the command, streaming its combined stdout/stderr into the log
        # file. Using Popen as a context manager closes the pipe and waits for
        # the process even if writing the log fails.
        with open(log_file, 'w') as f, subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1
        ) as process:
            for line in process.stdout:
                f.write(line)
                f.flush()

        if process.returncode != 0:
            with open(log_file, 'r') as f:
                failure_log = f.read()
            return jsonify({
                "error": f"PDF conversion failed with code {process.returncode}",
                "log": failure_log
            }), 500

        # Locate the generated markdown and JSON. The files are looked up
        # directly in output_dir first, then anywhere below it.
        # NOTE(review): MinerU is understood to nest results under
        # <output_dir>/<name>/<method>/ and to name its structured output
        # "<name>_content_list.json" -- confirm against the installed version.
        markdown_file = _find_output_file(output_dir, [f"{base_filename}.md"])
        json_file = _find_output_file(
            output_dir,
            [f"{base_filename}.json", f"{base_filename}_content_list.json"],
        )

        markdown_content = ""
        json_content = {}

        if markdown_file is not None:
            with open(markdown_file, 'r', encoding='utf-8') as f:
                markdown_content = f.read()

        if json_file is not None:
            with open(json_file, 'r', encoding='utf-8') as f:
                json_content = json.load(f)

        # Read the log file
        with open(log_file, 'r') as f:
            log_content = f.read()

        # Copy the output files to a location accessible for download
        output_markdown = os.path.join(job_dir, f"{base_filename}.md")
        output_json = os.path.join(job_dir, f"{base_filename}.json")

        if markdown_file is not None:
            shutil.copy(markdown_file, output_markdown)

        if json_file is not None:
            shutil.copy(json_file, output_json)

        # Return the conversion results
        return jsonify({
            "message": f"PDF '{filename}' converted successfully",
            "markdown": markdown_content,
            "json": json_content,
            "log": log_content,
            "base_filename": base_filename,
            "job_id": job_id,
            "markdown_url": url_for('download_file', job_id=job_id, filename=f"{base_filename}.md"),
            "json_url": url_for('download_file', job_id=job_id, filename=f"{base_filename}.json")
        })

    except Exception as e:
        # Top-level boundary of the request: report the failure to the client
        # instead of letting Flask produce a bare 500 page.
        import traceback
        error_details = traceback.format_exc()
        return jsonify({
            "error": f"Failed to convert PDF: {str(e)}",
            "details": error_details
        }), 500
|
| 440 |
+
|
| 441 |
+
@app.route('/download/<job_id>/<filename>')
def download_file(job_id, filename):
    """Serve a generated output file that belongs to a conversion job.

    Args:
        job_id: Name of the job sub-directory inside ``OUTPUT_FOLDER``.
        filename: Name of the file to send from that job directory.

    Returns:
        The response built by ``send_from_directory``, or a JSON error with
        status 404 when ``job_id`` does not name a direct child of
        ``OUTPUT_FOLDER``.
    """
    job_dir = os.path.join(OUTPUT_FOLDER, job_id)

    # send_from_directory only guards ``filename``; ``job_id`` is joined into
    # the directory by hand, so a value such as ".." would otherwise move the
    # served directory outside OUTPUT_FOLDER.
    output_root = os.path.realpath(OUTPUT_FOLDER)
    if os.path.dirname(os.path.realpath(job_dir)) != output_root:
        return jsonify({"error": "Invalid job ID"}), 404

    return send_from_directory(job_dir, filename)
|
| 445 |
+
|
| 446 |
+
# Add a sample PDF for testing
|
| 447 |
+
@app.route('/sample')
def add_sample():
    """Create a small sample PDF under ``UPLOAD_FOLDER/sample``.

    A short text file is written first and then rendered to ``sample.pdf``
    with ImageMagick's ``convert`` command.

    Returns:
        JSON with the path of the created PDF, or a JSON error with status
        500 when the PDF could not be produced.
    """
    try:
        sample_dir = os.path.join(UPLOAD_FOLDER, 'sample')
        os.makedirs(sample_dir, exist_ok=True)
        sample_path = os.path.join(sample_dir, 'sample.pdf')
        text_path = os.path.join(sample_dir, 'sample.txt')

        # Use simple text for the sample
        with open(text_path, 'w') as f:
            f.write("This is a sample PDF for testing MinerU.\n\nIt contains simple text to demonstrate the PDF to Markdown and JSON conversion capabilities.")

        conversion_error = {"error": "Could not create sample PDF. Please upload your own PDF file."}

        # check=True turns a non-zero exit status into an exception; without
        # it a failed conversion would still be reported as a success.
        try:
            subprocess.run(
                ['convert', '-size', '612x792', 'caption:@' + text_path, sample_path],
                check=True,
            )
        except (OSError, subprocess.SubprocessError):
            # OSError covers a missing or non-executable ``convert`` binary.
            return jsonify(conversion_error), 500

        # Guard against a "successful" run that produced no file.
        if not os.path.exists(sample_path):
            return jsonify(conversion_error), 500

        return jsonify({"message": "Sample PDF created", "path": sample_path})
    except Exception as e:
        return jsonify({"error": f"Failed to create sample PDF: {str(e)}"}), 500
|
| 469 |
+
|
| 470 |
+
@app.route('/health')
def health_check():
    """
    Health check endpoint for monitoring.

    Responds with JSON describing the application version, the Python
    runtime and platform, whether the upload/output folders exist and
    whether the ``magic-pdf`` command could be launched. Any unexpected
    failure yields a JSON body with status ``unhealthy`` and HTTP 500.
    """
    try:
        # Probe for the CLI by launching it: only a missing executable counts
        # as "not installed"; the exit status itself is ignored.
        try:
            subprocess.run(['magic-pdf', '--version'], capture_output=True, check=False)
        except FileNotFoundError:
            magic_pdf_available = False
        else:
            magic_pdf_available = True

        # Runtime details of the interpreter and host
        environment = {
            'python_version': platform.python_version(),
            'platform': platform.platform(),
            'processor': platform.processor(),
        }

        # State of the folders and tools the service depends on
        configuration = {
            'upload_folder_exists': os.path.exists(UPLOAD_FOLDER),
            'output_folder_exists': os.path.exists(OUTPUT_FOLDER),
            'magic_pdf_installed': magic_pdf_available,
        }

        return jsonify({
            'status': 'healthy',
            'version': APP_VERSION,
            'environment': environment,
            'configuration': configuration,
        })
    except Exception as exc:
        failure = {'status': 'unhealthy', 'error': str(exc)}
        return jsonify(failure), 500
|
| 507 |
+
|
| 508 |
+
def _ensure_magic_pdf_config(default_config):
    """Write the default MinerU configuration wherever it is still missing.

    The file is created at ``~/.config/magic_pdf/magic-pdf.json`` and, as a
    fallback location, at ``~/magic-pdf.json``. Existing files are left
    untouched.

    Args:
        default_config: JSON-serialisable dictionary to store.
    """
    config_dir = os.path.expanduser("~/.config/magic_pdf")
    os.makedirs(config_dir, exist_ok=True)

    candidates = (
        os.path.join(config_dir, "magic-pdf.json"),
        os.path.join(os.path.expanduser("~"), "magic-pdf.json"),
    )
    for config_file in candidates:
        if not os.path.exists(config_file):
            # The ``with`` block closes (and therefore flushes) the file, so
            # no extra delay is needed before the converter reads it.
            with open(config_file, 'w') as f:
                json.dump(default_config, f, indent=2)


def _find_output_file(search_dir, base_filename, extension):
    """Locate a converter output file inside ``search_dir``.

    Preference order: ``<search_dir>/<base_filename><extension>``; a file
    with that exact name in any sub-directory; otherwise the first file with
    the given extension, walking top-down in sorted order so the choice is
    deterministic.

    Args:
        search_dir: Directory the converter was asked to write into.
        base_filename: Uploaded file name without its extension.
        extension: File extension including the dot, e.g. ``".md"``.

    Returns:
        Path of the best match, or the expected top-level path when nothing
        was found (callers test it with ``os.path.exists``).
    """
    wanted = f"{base_filename}{extension}"
    expected = os.path.join(search_dir, wanted)
    if os.path.exists(expected):
        return expected

    matches = []
    for root, dirs, files in os.walk(search_dir):
        dirs.sort()  # in-place sort fixes the traversal order of os.walk
        matches.extend(
            os.path.join(root, name)
            for name in sorted(files)
            if name.endswith(extension)
        )

    for path in matches:
        if os.path.basename(path) == wanted:
            return path
    return matches[0] if matches else expected


@app.route('/api/convert', methods=['POST'])
def api_convert_pdf():
    """
    API endpoint for programmatic access to PDF conversion.

    Request:
        - POST request with 'file' field containing PDF file

    Response:
        - JSON with conversion results on success
        - JSON error with status 400 for a missing or non-PDF upload
        - JSON error with status 500 when the conversion fails
    """
    # Validate request
    if 'file' not in request.files:
        return jsonify({
            'success': False,
            'error': 'No file provided. Please upload a PDF file.'
        }), 400

    file = request.files['file']

    if file.filename == '':
        return jsonify({
            'success': False,
            'error': 'No file selected. Please select a PDF file.'
        }), 400

    # Check if the file is a PDF
    if not file.filename.lower().endswith('.pdf'):
        return jsonify({
            'success': False,
            'error': 'Invalid file format. Please upload a PDF file.'
        }), 400

    # Generate a job ID
    job_id = str(uuid.uuid4())

    # Create job directory
    job_dir = os.path.join(OUTPUT_FOLDER, job_id)
    os.makedirs(job_dir, exist_ok=True)

    # secure_filename may strip the name down to e.g. "pdf" or "" (names made
    # of non-ASCII characters or dots); fall back to a fixed, valid name.
    filename = secure_filename(file.filename)
    if not filename.lower().endswith('.pdf'):
        filename = 'document.pdf'
    base_filename, _ = os.path.splitext(filename)

    pdf_path = os.path.join(job_dir, filename)

    try:
        # Saved inside the try block so that a write failure is reported as
        # JSON like every other error of this endpoint.
        file.save(pdf_path)

        default_config = {
            "device-mode": "cpu",
            "layout-config": {
                "model": "doclayout_yolo",
                "enable": True
            },
            "formula-config": {
                "mfd_model": "yolo_v8_mfd",
                "mfr_model": "unimernet_small",
                "enable": True
            },
            "table-config": {
                "model": "rapid_table",
                "sub_model": "slanet_plus",
                "enable": True,
                "max_time": 400
            }
        }
        _ensure_magic_pdf_config(default_config)

        # Run the MinerU magic-pdf command and mirror its output to a log
        log_file = os.path.join(job_dir, "conversion.log")
        command = ["magic-pdf", "--path", pdf_path, "--output-dir", job_dir]
        output = []

        # The log is written and later read as UTF-8; undecodable bytes from
        # the child process are replaced instead of raising.
        with open(log_file, "w", encoding="utf-8") as log:
            with subprocess.Popen(
                command,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
                encoding="utf-8",
                errors="replace"
            ) as process:
                for line in process.stdout:
                    output.append(line)
                    log.write(line)
                    log.flush()  # keep the log readable while the job runs
            # Leaving the Popen context closes the pipe and waits for exit.
            exit_code = process.returncode

        if exit_code != 0:
            error_message = ''.join(output) if output else "Unknown error during PDF conversion"
            return jsonify({
                'success': False,
                'error': 'PDF conversion failed. Please check the log for details.',
                'log': error_message,
                'exit_code': exit_code
            }), 500

        # Locate the output files; the converter may place them in nested
        # sub-directories of the job directory, so the search is recursive.
        markdown_file = _find_output_file(job_dir, base_filename, '.md')
        json_file = _find_output_file(job_dir, base_filename, '.json')
        markdown_found = os.path.exists(markdown_file)
        json_found = os.path.exists(json_file)

        # Read markdown content
        markdown_content = ""
        if markdown_found:
            with open(markdown_file, 'r', encoding='utf-8') as f:
                markdown_content = f.read()
        else:
            print(f"Warning: Markdown file not found at {markdown_file}")

        # Read JSON content
        json_content = {}
        if json_found:
            with open(json_file, 'r', encoding='utf-8') as f:
                json_content = json.load(f)
        else:
            print(f"Warning: JSON file not found at {json_file}")

        # Read log content
        with open(log_file, 'r', encoding='utf-8') as f:
            log_content = f.read()

        # Create the result
        result = {
            'success': True,
            'message': 'PDF conversion successful',
            'job_id': job_id,
            'base_filename': base_filename,
            'file_info': {
                'original_filename': filename,
                'size_bytes': os.path.getsize(pdf_path),
                'content_type': 'application/pdf'
            },
            'markdown': markdown_content,
            'json': json_content,
            'log': log_content,
            'files': {
                # Relative to the job directory; equals the bare file name
                # for files stored directly in it.
                'markdown_path': os.path.relpath(markdown_file, job_dir) if markdown_found else None,
                'json_path': os.path.relpath(json_file, job_dir) if json_found else None
            }
        }

        return jsonify(result)

    except Exception as e:
        import traceback
        error_details = traceback.format_exc()

        return jsonify({
            'success': False,
            'error': f'An error occurred during PDF conversion: {str(e)}',
            'details': error_details,
            'job_id': job_id
        }), 500
|
| 703 |
+
|
| 704 |
+
if __name__ == '__main__':
    # Script entry point: serve on all interfaces, port 7860, with Flask's
    # debug mode switched off.
    app.run(
        host='0.0.0.0',
        port=7860,
        debug=False,
    )
|
entrypoint.sh
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
set -e

# Activate virtual environment
source /opt/mineru_venv/bin/activate

# The two checks below are diagnostics only. Because of `set -e`, a missing
# or failing command would otherwise abort the whole startup, so failures
# are reduced to a warning.

# Display GPU information
echo "Checking NVIDIA GPU status:"
nvidia-smi || echo "WARNING: nvidia-smi is unavailable or failed; continuing without GPU diagnostics."

# Display MinerU version
echo "MinerU version:"
magic-pdf --version || echo "WARNING: could not determine the magic-pdf version."
|
| 14 |
+
|
| 15 |
+
# Make sure the MinerU configuration directory exists
mkdir -p "$HOME/.config/magic_pdf"

# Choose the samples directory: prefer /app/samples and fall back to
# /tmp/samples when it cannot be created or is not writable. The fallback is
# created explicitly so later writes do not hit a missing directory.
SAMPLES_DIR="/app/samples"
if ! mkdir -p "$SAMPLES_DIR" 2>/dev/null || [ ! -w "$SAMPLES_DIR" ]; then
    SAMPLES_DIR="/tmp/samples"
    mkdir -p "$SAMPLES_DIR"
fi

# Download a sample PDF for testing if it doesn't exist
if [ ! -f "$SAMPLES_DIR/sample.pdf" ]; then
    echo "Downloading sample PDF for testing..."
    # Download a paper from arXiv; a failed download is tolerated
    wget -q "https://arxiv.org/pdf/2201.08239.pdf" -O "$SAMPLES_DIR/sample.pdf" || true

    # If that fails (missing or empty file), try another source
    if [ ! -s "$SAMPLES_DIR/sample.pdf" ]; then
        wget -q "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" -O "$SAMPLES_DIR/sample.pdf" || true
    fi

    # If both fail, create a simple PDF with text
    if [ ! -s "$SAMPLES_DIR/sample.pdf" ]; then
        echo "Failed to download sample PDF, creating a simple PDF text file..."
        echo "This is a sample PDF document for testing MinerU.

MinerU is a high-quality tool for converting PDF to Markdown and JSON formats.

This file was created for testing purposes." > "$SAMPLES_DIR/sample.txt"

        # Render the text with ImageMagick when it is installed. The sample
        # is optional, so a failed conversion must not stop the container
        # (it would under `set -e` without the `||` fallback).
        if command -v convert &> /dev/null; then
            convert -size 612x792 -background white -fill black caption:@"$SAMPLES_DIR/sample.txt" "$SAMPLES_DIR/sample.pdf" \
                || echo "WARNING: ImageMagick failed to create the sample PDF."
        else
            echo "WARNING: Could not create a sample PDF file automatically."
        fi
    fi
fi
|
| 54 |
+
|
| 55 |
+
# Create the magic-pdf.json config file (an existing file is kept as is)
CONFIG_DIR="$HOME/.config/magic_pdf"
mkdir -p "$CONFIG_DIR"
if [ ! -f "$CONFIG_DIR/magic-pdf.json" ]; then
    echo "Creating magic-pdf.json configuration file..."

    # Use "cuda" only when a working NVIDIA driver is visible, otherwise
    # "cpu". The previous value "gpu" is not a device name MinerU accepts.
    if command -v nvidia-smi > /dev/null 2>&1 && nvidia-smi > /dev/null 2>&1; then
        DEVICE_MODE="cuda"
    else
        DEVICE_MODE="cpu"
    fi

    # Unquoted delimiter on purpose: ${DEVICE_MODE} must be expanded.
    cat > "$CONFIG_DIR/magic-pdf.json" << EOF
{
    "device-mode": "${DEVICE_MODE}",
    "layout-config": {
        "model": "doclayout_yolo",
        "enable": true
    },
    "formula-config": {
        "mfd_model": "yolo_v8_mfd",
        "mfr_model": "unimernet_small",
        "enable": true
    },
    "table-config": {
        "model": "rapid_table",
        "sub_model": "slanet_plus",
        "enable": true,
        "max_time": 400
    }
}
EOF
fi
|
| 81 |
+
|
| 82 |
+
# Start the Flask application if it exists, otherwise start a minimal
# fallback server. `exec` replaces this shell with the Python process so that
# container signals (e.g. SIGTERM on shutdown) reach the server directly.
if [ -f "/app/app.py" ]; then
    echo "Starting Flask application..."
    exec python /app/app.py
else
    echo "No app.py found. Starting a simple server..."
    # Create a simple server that shows MinerU is installed.
    # The quoted 'EOF' delimiter keeps the shell from expanding anything
    # inside the generated Python/HTML/JavaScript source.
    TMP_APP_PATH="$HOME/simple_app.py"
    cat > "$TMP_APP_PATH" << 'EOF'
import subprocess

from flask import Flask, request, jsonify, render_template_string

app = Flask(__name__)

HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
    <title>MinerU PDF Processing</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
        }
        .container {
            background-color: #f9f9f9;
            padding: 20px;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        h1 {
            color: #2c3e50;
        }
        pre {
            background-color: #f1f1f1;
            padding: 10px;
            border-radius: 4px;
            overflow-x: auto;
        }
        .command {
            font-family: monospace;
            background-color: #eee;
            padding: 5px;
            border-radius: 3px;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>MinerU PDF Processing Service</h1>
        <p>This Space provides PDF processing capabilities using MinerU.</p>

        <h2>GPU Status</h2>
        <pre id="gpuStatus">Loading...</pre>

        <h2>Available Commands</h2>
        <p>MinerU provides the following commands:</p>
        <p><span class="command">magic-pdf</span> - Process PDF documents</p>

        <h2>Help Output</h2>
        <pre id="helpOutput">Loading...</pre>
    </div>

    <script>
        // Fetch GPU status
        fetch('/gpu-status')
            .then(response => response.json())
            .then(data => {
                document.getElementById('gpuStatus').textContent = data.output;
            })
            .catch(error => {
                document.getElementById('gpuStatus').textContent = 'Error fetching GPU status: ' + error.message;
            });

        // Fetch help output
        fetch('/help-output')
            .then(response => response.json())
            .then(data => {
                document.getElementById('helpOutput').textContent = data.output;
            })
            .catch(error => {
                document.getElementById('helpOutput').textContent = 'Error fetching help: ' + error.message;
            });
    </script>
</body>
</html>
"""


@app.route('/')
def index():
    """Render the static status page."""
    return render_template_string(HTML_TEMPLATE)


@app.route('/gpu-status')
def gpu_status():
    """Return the output of ``nvidia-smi`` (or an error text) as JSON."""
    try:
        output = subprocess.check_output(['nvidia-smi'], stderr=subprocess.STDOUT).decode('utf-8')
    except subprocess.CalledProcessError as e:
        output = f"Error running nvidia-smi: {e.output.decode('utf-8')}"
    except FileNotFoundError:
        output = "nvidia-smi command not found. GPU may not be available."
    return jsonify({"output": output})


@app.route('/help-output')
def help_output():
    """Return the output of ``magic-pdf --help`` (or an error text) as JSON."""
    try:
        output = subprocess.check_output(['magic-pdf', '--help'], stderr=subprocess.STDOUT).decode('utf-8')
    except subprocess.CalledProcessError as e:
        output = f"Error running magic-pdf --help: {e.output.decode('utf-8')}"
    except FileNotFoundError:
        output = "magic-pdf command not found. MinerU may not be installed correctly."
    return jsonify({"output": output})


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)
EOF

    exec python "$TMP_APP_PATH"
fi
|
requirements.txt
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
flask==2.3.3
|
| 2 |
+
transformers>=4.37.0
|
| 3 |
+
torch>=2.0.0
|
| 4 |
+
sentencepiece>=0.1.99
|
| 5 |
+
requests>=2.31.0
|
| 6 |
+
accelerate>=0.25.0
|
| 7 |
+
einops>=0.6.0
|
| 8 |
+
packaging>=23.0
|
| 9 |
+
werkzeug>=2.3.0
|
| 10 |
+
flask-cors>=4.0.0
|
space_config.json
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"runtime": "docker",
|
| 3 |
+
"hardware": "nvidia-l4-1x-16gb"
|
| 4 |
+
}
|