# minerU / app.py
# uatjonas's picture
# Upload 3 files
# 307fda1 verified
# raw
# history blame
# 1.65 kB
import os
import json
import sys
from huggingface_hub import snapshot_download
import subprocess
# 1. Resolve the filesystem locations used by the rest of the script.
# HOME falls back to "/home/user" when the environment variable is unset.
HOME = os.environ.get("HOME", "/home/user")
CONFIG_FILE = os.path.join(HOME, "magic-pdf.json")
MODEL_DIR = os.path.join(HOME, "models")

print("--- Starting MinerU Setup ---")

# 2. Fetch the model snapshot unless MODEL_DIR already exists.
# Note: the first start might take a few minutes while the download runs.
if os.path.exists(MODEL_DIR):
    print("Models found. Skipping download.")
else:
    print(f"Downloading models to {MODEL_DIR}...")
    try:
        # Pull the core model repository into MODEL_DIR.
        snapshot_download(
            "opendatalab/PDF-Extract-Kit-1.0",
            local_dir=MODEL_DIR,
            max_workers=4,
        )
        print("Model download complete.")
    except Exception as err:
        # A failed download is fatal: report it and stop with status 1.
        print(f"Error downloading models: {err}")
        sys.exit(1)
# 3. Generate magic-pdf.json Config
# MinerU requires this file to know where the models are.
#
# The original code referenced an undefined name (REAL_MODEL_DIR), which
# raised NameError before the config could be written. The path is derived
# from MODEL_DIR instead.
# NOTE(review): the downloaded snapshot presumably keeps the weights in a
# nested "models" subfolder -- confirm against the repository layout. When
# that subfolder is absent, MODEL_DIR itself is used.
_nested_models_dir = os.path.join(MODEL_DIR, "models")
_models_dir = _nested_models_dir if os.path.isdir(_nested_models_dir) else MODEL_DIR

config_data = {
    "models-dir": _models_dir,
    "device-mode": "cpu",  # Change to "cuda" if you are using a GPU Space
    "table-config": {
        "model": "TableMaster",
        "is_table_recog_enable": False,  # Disable table recognition for speed on CPU
        "max_time": 400,
    },
}

print(f"Writing configuration to {CONFIG_FILE}...")
# Explicit encoding so the output does not depend on the platform locale.
with open(CONFIG_FILE, "w", encoding="utf-8") as f:
    json.dump(config_data, f, indent=4)
# 4. Launch the MinerU REST API
print("Launching MinerU REST API...")
# The `mineru-api` entry point is used here rather than `mineru-gradio`.
command = [
    "mineru-api",
    "--host", "0.0.0.0",
    "--port", "7860",  # HF Spaces requires port 7860
]
# subprocess.run blocks until the server process exits. Forward its exit
# status so a failed or crashed server is not reported as a successful run.
completed = subprocess.run(command)
sys.exit(completed.returncode)