marcosremar2 committed on
Commit
ff60d55
·
1 Parent(s): f32ce28

Fix entrypoint script to properly install and verify magic-pdf tool

Browse files
Files changed (1) hide show
  1. entrypoint.sh +90 -148
entrypoint.sh CHANGED
@@ -1,169 +1,111 @@
1
  #!/bin/bash
2
  set -e
3
 
4
- # Activate virtual environment
5
- source /opt/mineru_venv/bin/activate
6
-
7
- # Display GPU information
8
- echo "===== Application Startup at $(date +'%Y-%m-%d %H:%M:%S') ====="
9
- echo ""
10
- echo "Checking NVIDIA GPU status:"
11
- nvidia-smi || echo "No NVIDIA GPU detected, running in CPU mode"
12
-
13
- # Display MinerU version
14
- echo "MinerU version:"
15
- magic-pdf --version || echo "Error: MinerU magic-pdf not found"
16
-
17
- # Ensure output directories have proper permissions
18
- echo "Checking output directories..."
19
- for DIR in "/tmp/pdf_uploads" "/tmp/pdf_output" "/tmp/models"; do
20
- if [ -d "$DIR" ]; then
21
- echo "Ensuring $DIR is writable..."
22
- chmod -R 777 "$DIR" || echo "Could not set permissions on $DIR"
23
- else
24
- echo "Creating directory $DIR..."
25
- mkdir -p "$DIR" || echo "Could not create $DIR"
26
- chmod -R 777 "$DIR" || echo "Could not set permissions on $DIR"
27
- fi
28
- done
29
-
30
- # Fallback to user's home directory if tmp is not writable
31
- if ! touch "/tmp/test_write_permission" 2>/dev/null; then
32
- echo "Warning: /tmp directory is not writable, using $HOME instead"
33
- export UPLOAD_FOLDER="$HOME/pdf_uploads"
34
- export OUTPUT_FOLDER="$HOME/pdf_output"
35
- mkdir -p "$UPLOAD_FOLDER" "$OUTPUT_FOLDER"
36
- echo "Using alternative upload folder: $UPLOAD_FOLDER"
37
- echo "Using alternative output folder: $OUTPUT_FOLDER"
38
- else
39
- rm -f "/tmp/test_write_permission"
40
- export UPLOAD_FOLDER="/tmp/pdf_uploads"
41
- export OUTPUT_FOLDER="/tmp/pdf_output"
42
- fi
43
-
44
- # Create directories for models if they don't exist
45
  mkdir -p /tmp/models/MFD/YOLO
46
  mkdir -p /tmp/models/MFR/unimernet
47
  mkdir -p /tmp/models/table/rapid
48
  mkdir -p /tmp/models/layout/doclayout
49
 
50
- # Check if model files exist, if not, download them
51
- echo "Checking model files..."
52
- MODEL_FILES=(
53
- "/tmp/models/MFD/YOLO/yolo_v8_ft.pt"
54
- "/tmp/models/MFD/YOLO/yolo_v8_mfd.pt"
55
- "/tmp/models/MFR/unimernet/unimernet_small.pth"
56
- "/tmp/models/table/rapid/rapid_table.pt"
57
- "/tmp/models/table/rapid/slanet_plus.pt"
58
- "/tmp/models/layout/doclayout/doclayout_yolo.pt"
59
- )
60
-
61
- MODELS_REPO="https://huggingface.co/opendatalab/MinerU/resolve/main/models"
62
- MODEL_URLS=(
63
- "${MODELS_REPO}/mfd/yolo_v8_mfd.pt"
64
- "${MODELS_REPO}/mfd/yolo_v8_mfd.pt"
65
- "${MODELS_REPO}/mfr/unimernet_small.pth"
66
- "${MODELS_REPO}/table/rapid_table.pt"
67
- "${MODELS_REPO}/table/slanet_plus.pt"
68
- "${MODELS_REPO}/layout/doclayout_yolo.pt"
69
- )
70
 
71
- for i in "${!MODEL_FILES[@]}"; do
72
- if [ ! -f "${MODEL_FILES[$i]}" ] || [ ! -s "${MODEL_FILES[$i]}" ]; then
73
- echo "Downloading ${MODEL_FILES[$i]}..."
74
- curl -L --retry 5 --retry-delay 5 -o "${MODEL_FILES[$i]}" "${MODEL_URLS[$i]}" || echo "Failed to download ${MODEL_FILES[$i]}"
75
-
76
- # Verify file size
77
- if [ -f "${MODEL_FILES[$i]}" ]; then
78
- SIZE=$(stat -c%s "${MODEL_FILES[$i]}" 2>/dev/null || stat -f%z "${MODEL_FILES[$i]}")
79
- if [ "$SIZE" -eq 0 ]; then
80
- echo "Warning: Downloaded file ${MODEL_FILES[$i]} is empty!"
81
- else
82
- echo "${MODEL_FILES[$i]} downloaded successfully (${SIZE} bytes)"
83
- fi
84
- fi
85
  else
86
- SIZE=$(stat -c%s "${MODEL_FILES[$i]}" 2>/dev/null || stat -f%z "${MODEL_FILES[$i]}")
87
- echo "${MODEL_FILES[$i]} already exists (${SIZE} bytes)."
88
  fi
89
- done
90
-
91
- # Create a samples directory
92
- mkdir -p $HOME/samples
93
- mkdir -p $HOME/.config/magic_pdf
94
 
95
- # Download a sample PDF for testing if it doesn't exist
96
- echo "Downloading sample PDF for testing..."
97
- if [ ! -f "$HOME/samples/sample.pdf" ]; then
98
- # Download a simple paper from arXiv (using a small one for quick processing)
99
- curl -L --retry 3 --retry-delay 3 -o "$HOME/samples/sample.pdf" "https://arxiv.org/pdf/2201.08239.pdf" || true
100
-
101
- # If that fails, try another source
102
- if [ ! -s "$HOME/samples/sample.pdf" ]; then
103
- curl -L --retry 3 -o "$HOME/samples/sample.pdf" "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" || true
104
- fi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
- # If both fail, create a simple PDF with text
107
- if [ ! -s "$HOME/samples/sample.pdf" ]; then
108
- echo "Failed to download sample PDF, creating a simple PDF text file..."
109
- echo "This is a sample PDF document for testing MinerU.
110
-
111
- MinerU is a high-quality tool for converting PDF to Markdown and JSON formats.
112
-
113
- This file was created for testing purposes." > "$HOME/samples/sample.txt"
 
 
 
 
 
 
 
 
114
 
115
- # Try using different methods to create a PDF
116
- if command -v convert &> /dev/null; then
117
- convert -size 612x792 -background white -fill black caption:@"$HOME/samples/sample.txt" "$HOME/samples/sample.pdf"
118
- else
119
- echo "WARNING: Could not create a sample PDF file automatically."
120
- fi
121
  fi
122
- fi
123
-
124
- # Create the magic-pdf.json config file with paths to the downloaded models
125
- echo "Creating magic-pdf.json configuration file..."
126
- cat > "$HOME/.config/magic_pdf/magic-pdf.json" << EOF
127
- {
128
- "device-mode": "gpu",
129
- "models-path": "/tmp/models",
130
- "layout-config": {
131
- "model": "doclayout_yolo",
132
- "model_path": "/tmp/models/layout/doclayout/doclayout_yolo.pt",
133
- "enable": true
134
- },
135
- "formula-config": {
136
- "mfd_model": "yolo_v8_mfd",
137
- "mfd_model_path": "/tmp/models/MFD/YOLO/yolo_v8_mfd.pt",
138
- "mfr_model": "unimernet_small",
139
- "mfr_model_path": "/tmp/models/MFR/unimernet/unimernet_small.pth",
140
- "enable": true
141
- },
142
- "table-config": {
143
- "model": "rapid_table",
144
- "model_path": "/tmp/models/table/rapid/rapid_table.pt",
145
- "sub_model": "slanet_plus",
146
- "sub_model_path": "/tmp/models/table/rapid/slanet_plus.pt",
147
- "enable": true,
148
- "max_time": 400
149
- }
150
  }
151
- EOF
152
 
153
- # Also create it in the home directory as some versions of MinerU look for it there
154
- cp "$HOME/.config/magic_pdf/magic-pdf.json" "$HOME/magic-pdf.json"
 
 
 
155
 
156
- # List model files to verify they're present
157
- echo "Verifying model files:"
158
- ls -la /tmp/models/MFD/YOLO/ || echo "YOLO models directory issue"
159
- ls -la /tmp/models/MFR/unimernet/ || echo "UniMERNet models directory issue"
160
- ls -la /tmp/models/table/rapid/ || echo "Table models directory issue"
161
- ls -la /tmp/models/layout/doclayout/ || echo "Layout models directory issue"
162
-
163
- # Check file sizes to ensure they're not empty
164
- echo "Checking model file sizes:"
165
- find /tmp/models -type f -size 0c -exec echo "Warning: Empty file {}" \;
166
 
167
  # Start the Flask application
168
  echo "Starting Flask application..."
169
- python /app/app.py
 
1
  #!/bin/bash
2
  set -e
3
 
4
+ # Show environment information
5
+ echo "Starting entrypoint.sh..."
6
+ echo "User: $(whoami)"
7
+ echo "Current directory: $(pwd)"
8
+ echo "Directory listing:"
9
+ ls -la
10
+
11
+ # Create necessary directories
12
+ mkdir -p /tmp/pdf_uploads
13
+ mkdir -p /tmp/pdf_output
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  mkdir -p /tmp/models/MFD/YOLO
15
  mkdir -p /tmp/models/MFR/unimernet
16
  mkdir -p /tmp/models/table/rapid
17
  mkdir -p /tmp/models/layout/doclayout
18
 
19
+ # Set permissions (ensure directories are writable)
20
+ chmod -R 777 /tmp/pdf_uploads
21
+ chmod -R 777 /tmp/pdf_output
22
+ chmod -R 777 /tmp/models
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
+ # Check if magic-pdf command exists
25
+ if ! command -v magic-pdf &> /dev/null; then
26
+ echo "Warning: magic-pdf command not found. Installing MinerU PDF processing tools..."
27
+ pip install --no-cache-dir minerupdf
28
+
29
+ # Verify installation
30
+ if command -v magic-pdf &> /dev/null; then
31
+ echo "Successfully installed magic-pdf."
 
 
 
 
 
 
32
  else
33
+ echo "Failed to install magic-pdf. Will use PyMuPDF fallback."
34
+ pip install --no-cache-dir pymupdf
35
  fi
36
+ else
37
+ echo "magic-pdf command is already installed."
38
+ fi
 
 
39
 
40
+ # Configure magic-pdf
41
+ mkdir -p ~/.config/magic_pdf
42
+ cat > ~/.config/magic_pdf/magic-pdf.json << EOL
43
+ {
44
+ "device-mode": "cpu",
45
+ "layout-config": {
46
+ "model": "doclayout_yolo",
47
+ "enable": true
48
+ },
49
+ "formula-config": {
50
+ "mfd_model": "yolo_v8_mfd",
51
+ "mfr_model": "unimernet_small",
52
+ "enable": true
53
+ },
54
+ "table-config": {
55
+ "model": "rapid_table",
56
+ "sub_model": "slanet_plus",
57
+ "enable": true,
58
+ "max_time": 400
59
+ }
60
+ }
61
+ EOL
62
+
63
+ # Download required model files if they don't exist or are empty
64
+ function download_model() {
65
+ local model_path=$1
66
+ local model_url=$2
67
+ local max_attempts=3
68
+ local attempt=1
69
 
70
+ if [ ! -f "$model_path" ] || [ ! -s "$model_path" ]; then
71
+ echo "Downloading model to $model_path from $model_url"
72
+ while [ $attempt -le $max_attempts ]; do
73
+ echo "Download attempt $attempt of $max_attempts..."
74
+ curl -L "$model_url" --output "$model_path" --retry 3 --retry-delay 2
75
+
76
+ if [ -f "$model_path" ] && [ -s "$model_path" ]; then
77
+ echo "Successfully downloaded model to $model_path ($(du -h "$model_path" | cut -f1) used)"
78
+ return 0
79
+ fi
80
+
81
+ echo "Download failed or file is empty. Retrying..."
82
+ rm -f "$model_path"
83
+ attempt=$((attempt+1))
84
+ sleep 2
85
+ done
86
 
87
+ echo "Failed to download model after $max_attempts attempts"
88
+ return 1
89
+ else
90
+ echo "Model already exists at $model_path ($(du -h "$model_path" | cut -f1) used)"
91
+ return 0
 
92
  fi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  }
 
94
 
95
+ # Download models
96
+ download_model "/tmp/models/MFD/YOLO/yolo_v8_ft.pt" "https://huggingface.co/marcosremar2/mineru-models/resolve/main/yolo_v8_ft.pt"
97
+ download_model "/tmp/models/MFR/unimernet/unimernet_small.pth" "https://huggingface.co/marcosremar2/mineru-models/resolve/main/unimernet_small.pth"
98
+ download_model "/tmp/models/table/rapid/rapid_table.pt" "https://huggingface.co/marcosremar2/mineru-models/resolve/main/rapid_table.pt"
99
+ download_model "/tmp/models/layout/doclayout/doclayout_yolo.pt" "https://huggingface.co/marcosremar2/mineru-models/resolve/main/doclayout_yolo.pt"
100
 
101
+ # Verify magic-pdf installation and functionality
102
+ if command -v magic-pdf &> /dev/null; then
103
+ echo "Testing magic-pdf command..."
104
+ magic-pdf --version || echo "magic-pdf command exists but may not be functioning properly"
105
+ else
106
+ echo "Warning: magic-pdf command still not available. Will use PyMuPDF as fallback."
107
+ fi
 
 
 
108
 
109
  # Start the Flask application
110
  echo "Starting Flask application..."
111
+ python app.py