uatjonas committed on
Commit
307fda1
·
verified ·
1 Parent(s): 59b3c18

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +33 -0
  2. app.py +58 -0
  3. requirements.txt +4 -0
Dockerfile ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Container image for serving the MinerU REST API on a CPU-only Hugging Face Space.
FROM python:3.10-slim

# System packages required at build time and run time:
#   build-essential / git / wget - compiling and fetching Python dependencies
#   libgl1 / libglib2.0-0        - shared libraries commonly needed by OpenCV-based PDF tools
# NOTE: `libgl1` replaces the transitional `libgl1-mesa-glx` package, which is
# not available on newer Debian releases and would make this step fail there.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        wget \
        git \
        libgl1 \
        libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Run as an unprivileged user named "user" with user ID 1000.
RUN useradd -m -u 1000 user
USER user

# pip installs for this user land in ~/.local/bin, so put that on PATH.
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

WORKDIR $HOME/app

# Copy only the requirements first so the dependency layers below stay cached
# when the application code changes.
COPY --chown=user requirements.txt .

# 1. Upgrade pip before anything is installed with it.
RUN pip install --no-cache-dir --upgrade pip

# 2. Install pre-compiled detectron2 from the extra wheel index (crucial for CPU Spaces).
RUN pip install --no-cache-dir detectron2 --extra-index-url https://wheels.myhloli.com

# 3. Install the remaining Python dependencies.
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application.
COPY --chown=user . .

# Run the application; app.py prepares the models/config and then starts the API server.
CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import sys
4
+ from huggingface_hub import snapshot_download
5
+ import subprocess
6
+
# Startup script: fetch the MinerU models, write magic-pdf.json, then launch
# the MinerU REST API.  Executed as `python app.py` (see the Dockerfile CMD).

# 1. Setup Configuration Paths
HOME = os.environ.get("HOME", "/home/user")
CONFIG_FILE = os.path.join(HOME, "magic-pdf.json")
MODEL_DIR = os.path.join(HOME, "models")

# Hugging Face repository that holds the core model weights.
MODEL_REPO = "opendatalab/PDF-Extract-Kit-1.0"


def resolve_models_dir(download_dir):
    """Return the directory that should be written as "models-dir".

    The original script referenced an undefined ``REAL_MODEL_DIR`` here,
    which raised ``NameError`` at startup.  If the downloaded snapshot
    contains a nested ``models`` folder, that folder is returned;
    otherwise ``download_dir`` itself is used.
    NOTE(review): the nested ``models/`` layout is assumed from the upstream
    repository structure - confirm against the actual snapshot contents.
    """
    nested = os.path.join(download_dir, "models")
    return nested if os.path.isdir(nested) else download_dir


def download_models():
    """Download the model snapshot into MODEL_DIR unless it already exists.

    Exits the process with status 1 if the download fails, because the API
    cannot work without the models.
    """
    # 2. Download Models (if not present)
    # Note: This might take a few minutes on the first start!
    if os.path.exists(MODEL_DIR):
        print("Models found. Skipping download.")
        return

    print(f"Downloading models to {MODEL_DIR}...")
    try:
        # Download core models
        snapshot_download(
            MODEL_REPO,
            local_dir=MODEL_DIR,
            max_workers=4,
        )
        print("Model download complete.")
    except Exception as e:  # top-level boundary: report and abort startup
        print(f"Error downloading models: {e}")
        sys.exit(1)


def build_config(models_dir):
    """Build the magic-pdf.json payload pointing at ``models_dir``."""
    return {
        "models-dir": models_dir,
        "device-mode": "cpu",  # Change to "cuda" if you are using a GPU Space
        "table-config": {
            "model": "TableMaster",
            "is_table_recog_enable": False,  # Disable table recognition for speed on CPU
            "max_time": 400,
        },
    }


def write_config(config_data, path=CONFIG_FILE):
    """Serialize ``config_data`` as indented JSON to ``path``."""
    print(f"Writing configuration to {path}...")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(config_data, f, indent=4)


def launch_api():
    """Start the MinerU REST API and return its exit status.

    Blocks until the server process terminates.
    """
    print("Launching MinerU REST API...")
    # Uses mineru-api (the REST server) rather than mineru-gradio.
    command = [
        "mineru-api",
        "--host", "0.0.0.0",
        "--port", "7860",  # HF Spaces requires port 7860
    ]
    return subprocess.run(command).returncode


def main():
    """Run the full startup sequence and exit with the server's status."""
    print("--- Starting MinerU Setup ---")
    download_models()

    # 3. Generate magic-pdf.json Config
    # MinerU requires this file to know where the models are.
    write_config(build_config(resolve_models_dir(MODEL_DIR)))

    # 4. Launch the MinerU REST API
    # Propagate the server's exit code so a crash is visible to the container
    # runtime instead of always exiting with status 0.
    sys.exit(launch_api())


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ magic-pdf[full]
2
+ mineru
3
+ gradio
4
+ huggingface_hub