"""Bootstrap script for the MinerU REST API.

Steps, in order:

1. Download the ``opendatalab/PDF-Extract-Kit-1.0`` snapshot into
   ``$HOME/models`` unless a previous run already completed the download.
2. Write ``$HOME/magic-pdf.json`` pointing at the downloaded models.
3. Run ``mineru-api`` in the foreground on 0.0.0.0:7860 and exit with the
   same status code as that process.
"""

import json
import os
import subprocess
import sys

from huggingface_hub import snapshot_download

# Fall back to /home/user when HOME is not set in the environment.
HOME = os.environ.get("HOME", "/home/user")
CONFIG_FILE = os.path.join(HOME, "magic-pdf.json")
MODEL_DIR = os.path.join(HOME, "models")

# Created only after snapshot_download() returns successfully.  Checking this
# file instead of the bare directory means an interrupted download (which
# leaves MODEL_DIR behind, half-filled) is retried on the next start rather
# than being mistaken for a complete model set.
DOWNLOAD_MARKER = os.path.join(MODEL_DIR, ".download_complete")

print("--- Starting MinerU Setup ---")

# --- 1. Model download -------------------------------------------------------
if os.path.exists(DOWNLOAD_MARKER):
    print("Models found. Skipping download.")
else:
    print(f"Downloading models to {MODEL_DIR}...")
    try:
        snapshot_download(
            "opendatalab/PDF-Extract-Kit-1.0",
            local_dir=MODEL_DIR,
            max_workers=4,
        )
        # Record success so later runs can skip the download.
        with open(DOWNLOAD_MARKER, "w", encoding="utf-8"):
            pass
    except Exception as e:
        # Top-level boundary: report the failure and stop, since the API
        # cannot be started without its models.
        print(f"Error downloading models: {e}")
        sys.exit(1)
    else:
        print("Model download complete.")

# --- 2. Configuration file ---------------------------------------------------
# The configuration points at the "models" sub-directory of the snapshot,
# not at the snapshot root.
REAL_MODEL_DIR = os.path.join(MODEL_DIR, "models")

# NOTE(review): the name of the config file that `mineru-api` reads may depend
# on the installed MinerU version -- confirm that magic-pdf.json (and this key
# layout) is what the installed release actually consults.
config_data = {
    "models-dir": REAL_MODEL_DIR,
    "device-mode": "cpu",
    "table-config": {
        "model": "TableMaster",
        "is_table_recog_enable": False,
        "max_time": 400,
    },
}

print(f"Writing configuration to {CONFIG_FILE}...")
with open(CONFIG_FILE, "w", encoding="utf-8") as f:
    json.dump(config_data, f, indent=4)

# --- 3. Launch the API -------------------------------------------------------
print("Launching MinerU REST API...")

command = [
    "mineru-api",
    "--host", "0.0.0.0",
    "--port", "7860",
]

# subprocess.run() blocks for as long as the server lives; flush first so the
# setup messages above are not held in Python's buffer for that whole time
# when stdout is a pipe or a file.
sys.stdout.flush()

try:
    result = subprocess.run(command)
except FileNotFoundError:
    print(f"Error launching MinerU REST API: '{command[0]}' not found on PATH")
    sys.exit(127)

# Propagate the server's exit status so a supervisor can see a failed start.
sys.exit(result.returncode)