Spaces:
Runtime error
Runtime error
Commit ·
684ccd8
1
Parent(s): 128728a
Fix permission issues and improve model downloading
Browse files
- Dockerfile +42 -32
- app.py +82 -8
- entrypoint.sh +47 -5
Dockerfile
CHANGED
|
@@ -26,6 +26,7 @@ RUN apt-get update && apt-get install -y \
|
|
| 26 |
libxrender-dev \
|
| 27 |
pkg-config \
|
| 28 |
libcairo2-dev \
|
|
|
|
| 29 |
&& rm -rf /var/lib/apt/lists/*
|
| 30 |
|
| 31 |
# Configure ImageMagick policy to allow PDF conversion (needed for sample PDF creation)
|
|
@@ -53,49 +54,58 @@ RUN pip install --no-cache-dir -e ".[full]"
|
|
| 53 |
# Install additional dependencies for the web application
|
| 54 |
RUN pip install --no-cache-dir flask==2.3.3 flask-cors==4.0.0 werkzeug==2.3.7
|
| 55 |
|
| 56 |
-
# Create
|
| 57 |
-
RUN
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
RUN mkdir -p /tmp/
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
# Download model weights
|
| 67 |
-
RUN echo "Downloading MinerU model weights..."
|
| 68 |
WORKDIR /tmp/models
|
| 69 |
|
| 70 |
-
# Download the YOLO model for formula detection
|
| 71 |
-
RUN
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
RUN mkdir -p /app/samples && chown -R user:user /app
|
| 88 |
|
| 89 |
-
# Set permissions for model files
|
| 90 |
-
RUN chown -R user:user /tmp/models
|
| 91 |
-
|
| 92 |
# Copy the application files
|
| 93 |
WORKDIR /app
|
| 94 |
COPY . /app/
|
| 95 |
|
| 96 |
# Fix permissions for the user
|
| 97 |
-
RUN chown -R user:user /app
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
# Expose the port
|
| 101 |
EXPOSE 7860
|
|
|
|
| 26 |
libxrender-dev \
|
| 27 |
pkg-config \
|
| 28 |
libcairo2-dev \
|
| 29 |
+
sudo \
|
| 30 |
&& rm -rf /var/lib/apt/lists/*
|
| 31 |
|
| 32 |
# Configure ImageMagick policy to allow PDF conversion (needed for sample PDF creation)
|
|
|
|
| 54 |
# Install additional dependencies for the web application
|
| 55 |
RUN pip install --no-cache-dir flask==2.3.3 flask-cors==4.0.0 werkzeug==2.3.7
|
| 56 |
|
| 57 |
+
# Create a non-root user for Hugging Face Spaces
|
| 58 |
+
RUN useradd -m -u 1000 user && \
|
| 59 |
+
echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/user && \
|
| 60 |
+
chmod 0440 /etc/sudoers.d/user
|
| 61 |
+
|
| 62 |
+
# Create directories with proper permissions
|
| 63 |
+
RUN mkdir -p /tmp/pdf_uploads /tmp/pdf_output && \
|
| 64 |
+
chmod -R 777 /tmp/pdf_uploads && \
|
| 65 |
+
chmod -R 777 /tmp/pdf_output && \
|
| 66 |
+
mkdir -p /tmp/samples && \
|
| 67 |
+
chmod -R 777 /tmp/samples
|
| 68 |
+
|
| 69 |
+
# Create models directory structure with proper permissions
|
| 70 |
+
RUN mkdir -p /tmp/models/MFD/YOLO && \
|
| 71 |
+
mkdir -p /tmp/models/MFR/unimernet && \
|
| 72 |
+
mkdir -p /tmp/models/table/rapid && \
|
| 73 |
+
mkdir -p /tmp/models/layout/doclayout && \
|
| 74 |
+
chmod -R 777 /tmp/models
|
| 75 |
|
| 76 |
# Download model weights
|
|
|
|
| 77 |
WORKDIR /tmp/models
|
| 78 |
|
| 79 |
+
# Download all model weights (MFD YOLO, MFR UniMERNet, table, and layout models) using curl with retries
|
| 80 |
+
RUN apt-get update && apt-get install -y curl && \
|
| 81 |
+
curl -L --retry 5 --retry-delay 5 -o /tmp/models/MFD/YOLO/yolo_v8_ft.pt https://huggingface.co/opendatalab/MinerU/resolve/main/models/mfd/yolo_v8_mfd.pt && \
|
| 82 |
+
curl -L --retry 5 --retry-delay 5 -o /tmp/models/MFD/YOLO/yolo_v8_mfd.pt https://huggingface.co/opendatalab/MinerU/resolve/main/models/mfd/yolo_v8_mfd.pt && \
|
| 83 |
+
curl -L --retry 5 --retry-delay 5 -o /tmp/models/MFR/unimernet/unimernet_small.pth https://huggingface.co/opendatalab/MinerU/resolve/main/models/mfr/unimernet_small.pth && \
|
| 84 |
+
curl -L --retry 5 --retry-delay 5 -o /tmp/models/table/rapid/rapid_table.pt https://huggingface.co/opendatalab/MinerU/resolve/main/models/table/rapid_table.pt && \
|
| 85 |
+
curl -L --retry 5 --retry-delay 5 -o /tmp/models/table/rapid/slanet_plus.pt https://huggingface.co/opendatalab/MinerU/resolve/main/models/table/slanet_plus.pt && \
|
| 86 |
+
curl -L --retry 5 --retry-delay 5 -o /tmp/models/layout/doclayout/doclayout_yolo.pt https://huggingface.co/opendatalab/MinerU/resolve/main/models/layout/doclayout_yolo.pt && \
|
| 87 |
+
ls -la /tmp/models/MFD/YOLO/ && \
|
| 88 |
+
ls -la /tmp/models/MFR/unimernet/ && \
|
| 89 |
+
ls -la /tmp/models/table/rapid/ && \
|
| 90 |
+
ls -la /tmp/models/layout/doclayout/
|
| 91 |
+
|
| 92 |
+
# Warn about empty model files (this only prints a warning; it does not fail the build)
|
| 93 |
+
RUN find /tmp/models -type f -size 0 -exec echo "Warning: Empty file {}" \;
|
| 94 |
+
|
| 95 |
+
# Create app directory
|
| 96 |
RUN mkdir -p /app/samples && chown -R user:user /app
|
| 97 |
|
|
|
|
|
|
|
|
|
|
| 98 |
# Copy the application files
|
| 99 |
WORKDIR /app
|
| 100 |
COPY . /app/
|
| 101 |
|
| 102 |
# Fix permissions for the user
|
| 103 |
+
RUN chown -R user:user /app && \
|
| 104 |
+
mkdir -p /home/user/.config/magic_pdf && \
|
| 105 |
+
chown -R user:user /home/user/.config && \
|
| 106 |
+
chown -R user:user /tmp/pdf_uploads && \
|
| 107 |
+
chown -R user:user /tmp/pdf_output && \
|
| 108 |
+
chown -R user:user /tmp/models
|
| 109 |
|
| 110 |
# Expose the port
|
| 111 |
EXPOSE 7860
|
app.py
CHANGED
|
@@ -9,12 +9,15 @@ import tempfile
|
|
| 9 |
import json
|
| 10 |
import time
|
| 11 |
import shutil
|
|
|
|
| 12 |
|
| 13 |
app = Flask(__name__)
|
| 14 |
CORS(app)
|
| 15 |
|
| 16 |
# Configure logging
|
| 17 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
|
|
|
|
| 18 |
|
| 19 |
# Constants
|
| 20 |
UPLOAD_FOLDER = os.environ.get('UPLOAD_FOLDER', '/tmp/pdf_uploads')
|
|
@@ -22,9 +25,34 @@ OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', '/tmp/pdf_output')
|
|
| 22 |
ALLOWED_EXTENSIONS = {'pdf'}
|
| 23 |
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
|
| 24 |
|
| 25 |
-
# Ensure the directories exist
|
| 26 |
-
|
| 27 |
-
os.makedirs(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
# Function to check if file extension is allowed
|
| 30 |
def allowed_file(filename):
|
|
@@ -112,6 +140,13 @@ HTML_TEMPLATE = """
|
|
| 112 |
a:hover {
|
| 113 |
text-decoration: underline;
|
| 114 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
</style>
|
| 116 |
</head>
|
| 117 |
<body>
|
|
@@ -156,6 +191,12 @@ HTML_TEMPLATE = """
|
|
| 156 |
curl -X POST -F "file=@your_file.pdf" https://marcosremar2-mineruapi.hf.space/api/convert
|
| 157 |
</div>
|
| 158 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
</div>
|
| 160 |
|
| 161 |
<script>
|
|
@@ -237,7 +278,10 @@ HTML_TEMPLATE = """
|
|
| 237 |
# Route for the main page
|
| 238 |
@app.route('/')
|
| 239 |
def index():
|
| 240 |
-
return render_template_string(HTML_TEMPLATE,
|
|
|
|
|
|
|
|
|
|
| 241 |
|
| 242 |
# Route for the health check
|
| 243 |
@app.route('/health')
|
|
@@ -263,11 +307,26 @@ def health_check():
|
|
| 263 |
"doclayout_model": os.path.exists("/tmp/models/layout/doclayout/doclayout_yolo.pt")
|
| 264 |
}
|
| 265 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
return jsonify({
|
| 267 |
"status": "healthy",
|
| 268 |
"version": version,
|
| 269 |
"gpu_available": gpu_available,
|
| 270 |
"model_dirs": model_dirs,
|
|
|
|
| 271 |
"timestamp": time.time()
|
| 272 |
})
|
| 273 |
except Exception as e:
|
|
@@ -314,12 +373,26 @@ def convert_pdf():
|
|
| 314 |
# Create a unique session ID
|
| 315 |
session_id = str(uuid.uuid4())
|
| 316 |
session_dir = os.path.join(OUTPUT_FOLDER, session_id)
|
| 317 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
|
| 319 |
# Save the uploaded file
|
| 320 |
input_path = os.path.join(UPLOAD_FOLDER, f"{session_id}_{file.filename}")
|
| 321 |
-
|
| 322 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
|
| 324 |
# Get output file paths
|
| 325 |
base_filename = os.path.splitext(os.path.basename(file.filename))[0]
|
|
@@ -343,6 +416,7 @@ def convert_pdf():
|
|
| 343 |
"--mfd"
|
| 344 |
]
|
| 345 |
|
|
|
|
| 346 |
process = subprocess.run(cmd, capture_output=True, text=True, check=False)
|
| 347 |
|
| 348 |
if process.returncode != 0:
|
|
@@ -391,7 +465,7 @@ def convert_pdf():
|
|
| 391 |
finally:
|
| 392 |
# Clean up the input file
|
| 393 |
try:
|
| 394 |
-
if os.path.exists(input_path):
|
| 395 |
os.remove(input_path)
|
| 396 |
except Exception as e:
|
| 397 |
logging.warning(f"Failed to clean up input file: {str(e)}")
|
|
|
|
| 9 |
import json
|
| 10 |
import time
|
| 11 |
import shutil
|
| 12 |
+
import sys
|
| 13 |
|
| 14 |
app = Flask(__name__)
|
| 15 |
CORS(app)
|
| 16 |
|
| 17 |
# Configure logging
|
| 18 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
| 19 |
+
logger = logging.getLogger()
|
| 20 |
+
logger.addHandler(logging.StreamHandler(sys.stdout))
|
| 21 |
|
| 22 |
# Constants
|
| 23 |
UPLOAD_FOLDER = os.environ.get('UPLOAD_FOLDER', '/tmp/pdf_uploads')
|
|
|
|
| 25 |
ALLOWED_EXTENSIONS = {'pdf'}
|
| 26 |
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
|
| 27 |
|
| 28 |
+
# Ensure the directories exist and are writable
|
| 29 |
+
try:
|
| 30 |
+
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
| 31 |
+
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
|
| 32 |
+
# Test if we can write to these directories
|
| 33 |
+
test_upload_file = os.path.join(UPLOAD_FOLDER, '.test_file')
|
| 34 |
+
test_output_file = os.path.join(OUTPUT_FOLDER, '.test_file')
|
| 35 |
+
|
| 36 |
+
with open(test_upload_file, 'w') as f:
|
| 37 |
+
f.write('test')
|
| 38 |
+
os.remove(test_upload_file)
|
| 39 |
+
|
| 40 |
+
with open(test_output_file, 'w') as f:
|
| 41 |
+
f.write('test')
|
| 42 |
+
os.remove(test_output_file)
|
| 43 |
+
|
| 44 |
+
logging.info(f"Using upload folder: {UPLOAD_FOLDER}")
|
| 45 |
+
logging.info(f"Using output folder: {OUTPUT_FOLDER}")
|
| 46 |
+
except Exception as e:
|
| 47 |
+
# Fall back to user's home directory
|
| 48 |
+
logging.warning(f"Cannot use specified directories: {str(e)}")
|
| 49 |
+
USER_HOME = os.path.expanduser("~")
|
| 50 |
+
UPLOAD_FOLDER = os.path.join(USER_HOME, 'pdf_uploads')
|
| 51 |
+
OUTPUT_FOLDER = os.path.join(USER_HOME, 'pdf_output')
|
| 52 |
+
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
| 53 |
+
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
|
| 54 |
+
logging.info(f"Using fallback upload folder: {UPLOAD_FOLDER}")
|
| 55 |
+
logging.info(f"Using fallback output folder: {OUTPUT_FOLDER}")
|
| 56 |
|
| 57 |
# Function to check if file extension is allowed
|
| 58 |
def allowed_file(filename):
|
|
|
|
| 140 |
a:hover {
|
| 141 |
text-decoration: underline;
|
| 142 |
}
|
| 143 |
+
.system-info {
|
| 144 |
+
font-size: 12px;
|
| 145 |
+
color: #777;
|
| 146 |
+
margin-top: 20px;
|
| 147 |
+
padding-top: 10px;
|
| 148 |
+
border-top: 1px solid #ddd;
|
| 149 |
+
}
|
| 150 |
</style>
|
| 151 |
</head>
|
| 152 |
<body>
|
|
|
|
| 191 |
curl -X POST -F "file=@your_file.pdf" https://marcosremar2-mineruapi.hf.space/api/convert
|
| 192 |
</div>
|
| 193 |
</div>
|
| 194 |
+
|
| 195 |
+
<div class="system-info">
|
| 196 |
+
<p>Upload directory: {{ UPLOAD_FOLDER }}</p>
|
| 197 |
+
<p>Output directory: {{ OUTPUT_FOLDER }}</p>
|
| 198 |
+
<p>Max file size: {{ MAX_FILE_SIZE // (1024*1024) }}MB</p>
|
| 199 |
+
</div>
|
| 200 |
</div>
|
| 201 |
|
| 202 |
<script>
|
|
|
|
| 278 |
# Route for the main page
|
| 279 |
@app.route('/')
|
| 280 |
def index():
|
| 281 |
+
return render_template_string(HTML_TEMPLATE,
|
| 282 |
+
MAX_FILE_SIZE=MAX_FILE_SIZE,
|
| 283 |
+
UPLOAD_FOLDER=UPLOAD_FOLDER,
|
| 284 |
+
OUTPUT_FOLDER=OUTPUT_FOLDER)
|
| 285 |
|
| 286 |
# Route for the health check
|
| 287 |
@app.route('/health')
|
|
|
|
| 307 |
"doclayout_model": os.path.exists("/tmp/models/layout/doclayout/doclayout_yolo.pt")
|
| 308 |
}
|
| 309 |
|
| 310 |
+
# Check directory permissions
|
| 311 |
+
dir_permissions = {
|
| 312 |
+
"upload_dir": {
|
| 313 |
+
"path": UPLOAD_FOLDER,
|
| 314 |
+
"exists": os.path.exists(UPLOAD_FOLDER),
|
| 315 |
+
"writable": os.access(UPLOAD_FOLDER, os.W_OK)
|
| 316 |
+
},
|
| 317 |
+
"output_dir": {
|
| 318 |
+
"path": OUTPUT_FOLDER,
|
| 319 |
+
"exists": os.path.exists(OUTPUT_FOLDER),
|
| 320 |
+
"writable": os.access(OUTPUT_FOLDER, os.W_OK)
|
| 321 |
+
}
|
| 322 |
+
}
|
| 323 |
+
|
| 324 |
return jsonify({
|
| 325 |
"status": "healthy",
|
| 326 |
"version": version,
|
| 327 |
"gpu_available": gpu_available,
|
| 328 |
"model_dirs": model_dirs,
|
| 329 |
+
"directories": dir_permissions,
|
| 330 |
"timestamp": time.time()
|
| 331 |
})
|
| 332 |
except Exception as e:
|
|
|
|
| 373 |
# Create a unique session ID
|
| 374 |
session_id = str(uuid.uuid4())
|
| 375 |
session_dir = os.path.join(OUTPUT_FOLDER, session_id)
|
| 376 |
+
|
| 377 |
+
try:
|
| 378 |
+
os.makedirs(session_dir, exist_ok=True)
|
| 379 |
+
logging.info(f"Created session directory: {session_dir}")
|
| 380 |
+
except PermissionError as e:
|
| 381 |
+
logging.error(f"Permission error creating directory {session_dir}: {str(e)}")
|
| 382 |
+
# Try using a temp directory instead
|
| 383 |
+
session_dir = tempfile.mkdtemp(prefix="minerupdf_")
|
| 384 |
+
logging.info(f"Using temporary directory instead: {session_dir}")
|
| 385 |
|
| 386 |
# Save the uploaded file
|
| 387 |
input_path = os.path.join(UPLOAD_FOLDER, f"{session_id}_{file.filename}")
|
| 388 |
+
try:
|
| 389 |
+
file.save(input_path)
|
| 390 |
+
logging.info(f"Saved uploaded file to {input_path}")
|
| 391 |
+
except PermissionError:
|
| 392 |
+
# Try saving in the temp directory
|
| 393 |
+
input_path = os.path.join(session_dir, f"{session_id}_{file.filename}")
|
| 394 |
+
file.save(input_path)
|
| 395 |
+
logging.info(f"Saved uploaded file to alternate location: {input_path}")
|
| 396 |
|
| 397 |
# Get output file paths
|
| 398 |
base_filename = os.path.splitext(os.path.basename(file.filename))[0]
|
|
|
|
| 416 |
"--mfd"
|
| 417 |
]
|
| 418 |
|
| 419 |
+
logging.info(f"Running command: {' '.join(cmd)}")
|
| 420 |
process = subprocess.run(cmd, capture_output=True, text=True, check=False)
|
| 421 |
|
| 422 |
if process.returncode != 0:
|
|
|
|
| 465 |
finally:
|
| 466 |
# Clean up the input file
|
| 467 |
try:
|
| 468 |
+
if 'input_path' in locals() and os.path.exists(input_path):
|
| 469 |
os.remove(input_path)
|
| 470 |
except Exception as e:
|
| 471 |
logging.warning(f"Failed to clean up input file: {str(e)}")
|
entrypoint.sh
CHANGED
|
@@ -14,6 +14,33 @@ nvidia-smi || echo "No NVIDIA GPU detected, running in CPU mode"
|
|
| 14 |
echo "MinerU version:"
|
| 15 |
magic-pdf --version || echo "Error: MinerU magic-pdf not found"
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
# Create directories for models if they don't exist
|
| 18 |
mkdir -p /tmp/models/MFD/YOLO
|
| 19 |
mkdir -p /tmp/models/MFR/unimernet
|
|
@@ -42,11 +69,22 @@ MODEL_URLS=(
|
|
| 42 |
)
|
| 43 |
|
| 44 |
for i in "${!MODEL_FILES[@]}"; do
|
| 45 |
-
if [ ! -f "${MODEL_FILES[$i]}" ]; then
|
| 46 |
echo "Downloading ${MODEL_FILES[$i]}..."
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
else
|
| 49 |
-
|
|
|
|
| 50 |
fi
|
| 51 |
done
|
| 52 |
|
|
@@ -58,11 +96,11 @@ mkdir -p $HOME/.config/magic_pdf
|
|
| 58 |
echo "Downloading sample PDF for testing..."
|
| 59 |
if [ ! -f "$HOME/samples/sample.pdf" ]; then
|
| 60 |
# Download a simple paper from arXiv (using a small one for quick processing)
|
| 61 |
-
|
| 62 |
|
| 63 |
# If that fails, try another source
|
| 64 |
if [ ! -s "$HOME/samples/sample.pdf" ]; then
|
| 65 |
-
|
| 66 |
fi
|
| 67 |
|
| 68 |
# If both fail, create a simple PDF with text
|
|
@@ -122,6 +160,10 @@ ls -la /tmp/models/MFR/unimernet/ || echo "UniMERNet models directory issue"
|
|
| 122 |
ls -la /tmp/models/table/rapid/ || echo "Table models directory issue"
|
| 123 |
ls -la /tmp/models/layout/doclayout/ || echo "Layout models directory issue"
|
| 124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
# Start the Flask application
|
| 126 |
echo "Starting Flask application..."
|
| 127 |
python /app/app.py
|
|
|
|
| 14 |
echo "MinerU version:"
|
| 15 |
magic-pdf --version || echo "Error: MinerU magic-pdf not found"
|
| 16 |
|
| 17 |
+
# Ensure output directories have proper permissions
|
| 18 |
+
echo "Checking output directories..."
|
| 19 |
+
for DIR in "/tmp/pdf_uploads" "/tmp/pdf_output" "/tmp/models"; do
|
| 20 |
+
if [ -d "$DIR" ]; then
|
| 21 |
+
echo "Ensuring $DIR is writable..."
|
| 22 |
+
chmod -R 777 "$DIR" || echo "Could not set permissions on $DIR"
|
| 23 |
+
else
|
| 24 |
+
echo "Creating directory $DIR..."
|
| 25 |
+
mkdir -p "$DIR" || echo "Could not create $DIR"
|
| 26 |
+
chmod -R 777 "$DIR" || echo "Could not set permissions on $DIR"
|
| 27 |
+
fi
|
| 28 |
+
done
|
| 29 |
+
|
| 30 |
+
# Fallback to user's home directory if tmp is not writable
|
| 31 |
+
if ! touch "/tmp/test_write_permission" 2>/dev/null; then
|
| 32 |
+
echo "Warning: /tmp directory is not writable, using $HOME instead"
|
| 33 |
+
export UPLOAD_FOLDER="$HOME/pdf_uploads"
|
| 34 |
+
export OUTPUT_FOLDER="$HOME/pdf_output"
|
| 35 |
+
mkdir -p "$UPLOAD_FOLDER" "$OUTPUT_FOLDER"
|
| 36 |
+
echo "Using alternative upload folder: $UPLOAD_FOLDER"
|
| 37 |
+
echo "Using alternative output folder: $OUTPUT_FOLDER"
|
| 38 |
+
else
|
| 39 |
+
rm -f "/tmp/test_write_permission"
|
| 40 |
+
export UPLOAD_FOLDER="/tmp/pdf_uploads"
|
| 41 |
+
export OUTPUT_FOLDER="/tmp/pdf_output"
|
| 42 |
+
fi
|
| 43 |
+
|
| 44 |
# Create directories for models if they don't exist
|
| 45 |
mkdir -p /tmp/models/MFD/YOLO
|
| 46 |
mkdir -p /tmp/models/MFR/unimernet
|
|
|
|
| 69 |
)
|
| 70 |
|
| 71 |
for i in "${!MODEL_FILES[@]}"; do
|
| 72 |
+
if [ ! -f "${MODEL_FILES[$i]}" ] || [ ! -s "${MODEL_FILES[$i]}" ]; then
|
| 73 |
echo "Downloading ${MODEL_FILES[$i]}..."
|
| 74 |
+
curl -L --retry 5 --retry-delay 5 -o "${MODEL_FILES[$i]}" "${MODEL_URLS[$i]}" || echo "Failed to download ${MODEL_FILES[$i]}"
|
| 75 |
+
|
| 76 |
+
# Verify file size
|
| 77 |
+
if [ -f "${MODEL_FILES[$i]}" ]; then
|
| 78 |
+
SIZE=$(stat -c%s "${MODEL_FILES[$i]}" 2>/dev/null || stat -f%z "${MODEL_FILES[$i]}")
|
| 79 |
+
if [ "$SIZE" -eq 0 ]; then
|
| 80 |
+
echo "Warning: Downloaded file ${MODEL_FILES[$i]} is empty!"
|
| 81 |
+
else
|
| 82 |
+
echo "${MODEL_FILES[$i]} downloaded successfully (${SIZE} bytes)"
|
| 83 |
+
fi
|
| 84 |
+
fi
|
| 85 |
else
|
| 86 |
+
SIZE=$(stat -c%s "${MODEL_FILES[$i]}" 2>/dev/null || stat -f%z "${MODEL_FILES[$i]}")
|
| 87 |
+
echo "${MODEL_FILES[$i]} already exists (${SIZE} bytes)."
|
| 88 |
fi
|
| 89 |
done
|
| 90 |
|
|
|
|
| 96 |
echo "Downloading sample PDF for testing..."
|
| 97 |
if [ ! -f "$HOME/samples/sample.pdf" ]; then
|
| 98 |
# Download a simple paper from arXiv (using a small one for quick processing)
|
| 99 |
+
curl -L --retry 3 --retry-delay 3 -o "$HOME/samples/sample.pdf" "https://arxiv.org/pdf/2201.08239.pdf" || true
|
| 100 |
|
| 101 |
# If that fails, try another source
|
| 102 |
if [ ! -s "$HOME/samples/sample.pdf" ]; then
|
| 103 |
+
curl -L --retry 3 -o "$HOME/samples/sample.pdf" "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf" || true
|
| 104 |
fi
|
| 105 |
|
| 106 |
# If both fail, create a simple PDF with text
|
|
|
|
| 160 |
ls -la /tmp/models/table/rapid/ || echo "Table models directory issue"
|
| 161 |
ls -la /tmp/models/layout/doclayout/ || echo "Layout models directory issue"
|
| 162 |
|
| 163 |
+
# Check file sizes to ensure they're not empty
|
| 164 |
+
echo "Checking model file sizes:"
|
| 165 |
+
find /tmp/models -type f -size 0c -exec echo "Warning: Empty file {}" \;
|
| 166 |
+
|
| 167 |
# Start the Flask application
|
| 168 |
echo "Starting Flask application..."
|
| 169 |
python /app/app.py
|