File size: 1,647 Bytes
307fda1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import os
import json
import sys
from huggingface_hub import snapshot_download
import subprocess

# 1. Resolve the filesystem locations used by the rest of this script.
# The base directory is taken from $HOME, defaulting to "/home/user" when the
# variable is not set.
HOME = os.getenv("HOME", "/home/user")

# Both targets sit directly under HOME: the JSON config file that gets written
# below, and the directory the model snapshot is stored in.
CONFIG_FILE, MODEL_DIR = (
    os.path.join(HOME, leaf) for leaf in ("magic-pdf.json", "models")
)

print("--- Starting MinerU Setup ---")

# 2. Download Models (unless a previous download finished successfully)
# Note: This might take a few minutes on the first start!
#
# The mere existence of MODEL_DIR is not proof that the models are usable: an
# interrupted first start leaves the directory behind with only part of the
# files. A marker file is therefore written only after snapshot_download()
# has returned, and the download is skipped only when that marker is present.
# A directory populated before this marker existed gets one more
# snapshot_download() call.
# NOTE(review): snapshot_download() is expected to skip files that are already
# complete on disk, making that extra call cheap — confirm against the
# huggingface_hub version in use.
_DOWNLOAD_MARKER = os.path.join(MODEL_DIR, ".download_complete")

if os.path.exists(_DOWNLOAD_MARKER):
    print("Models found. Skipping download.")
else:
    print(f"Downloading models to {MODEL_DIR}...")
    try:
        # Download core models
        snapshot_download(
            "opendatalab/PDF-Extract-Kit-1.0",
            local_dir=MODEL_DIR,
            max_workers=4,
        )
    except Exception as e:
        # Top-level boundary: report on stderr and stop with a failure status,
        # since the rest of the script depends on the models.
        print(f"Error downloading models: {e}", file=sys.stderr)
        sys.exit(1)

    # Record success so later starts can skip the download.
    with open(_DOWNLOAD_MARKER, "w", encoding="utf-8"):
        pass
    print("Model download complete.")

# 3. Generate magic-pdf.json Config
# MinerU requires this file to know where the models are.
#
# REAL_MODEL_DIR was referenced here without ever being defined, which made
# the script fail with NameError before the config could be written.
# NOTE(review): the PDF-Extract-Kit snapshot appears to keep its weights in a
# top-level "models" sub-folder (MinerU's own download script appends
# "/models" to the snapshot path) — confirm against the repository layout.
# That sub-folder is used when it exists; otherwise the download root is used.
_nested_model_dir = os.path.join(MODEL_DIR, "models")
REAL_MODEL_DIR = (
    _nested_model_dir if os.path.isdir(_nested_model_dir) else MODEL_DIR
)

config_data = {
    "models-dir": REAL_MODEL_DIR,
    "device-mode": "cpu",  # Change to "cuda" if you are using a GPU Space
    "table-config": {
        "model": "TableMaster",
        "is_table_recog_enable": False,  # Disable table recognition for speed on CPU
        "max_time": 400,
    },
}

print(f"Writing configuration to {CONFIG_FILE}...")
# Overwrites any existing config file on every start.
with open(CONFIG_FILE, "w", encoding="utf-8") as f:
    json.dump(config_data, f, indent=4)

# 4. Launch the MinerU REST API
print("Launching MinerU REST API...")

# Use mineru-api (the REST server) instead of mineru-gradio.
command = [
    "mineru-api",
    "--host", "0.0.0.0",
    "--port", "7860",  # HF Spaces requires port 7860
]

# subprocess.run() blocks until the server process exits. Its exit status used
# to be discarded, so a crashed server made this launcher exit with status 0;
# a non-zero status is now passed on to whatever started this script.
try:
    completed = subprocess.run(command)
except FileNotFoundError:
    # The executable is not installed / not on PATH: say so plainly instead of
    # ending in a traceback. 127 is the conventional "command not found" code.
    print(f"Error: '{command[0]}' executable not found.", file=sys.stderr)
    sys.exit(127)

if completed.returncode != 0:
    sys.exit(completed.returncode)