RabbitMQ and SVM model
Browse files- .gitattributes +71 -71
- Dockerfile +25 -25
- README.md +17 -17
- app.py +29 -25
- download_models_hf.py +66 -66
- header.html +131 -131
- inference_svm_model.py +29 -0
- mineru_single.py +80 -58
- model_classification/svm_model.joblib +3 -0
- paddleocr/whl/rec/cyrillic/cyrillic_PP-OCRv3_rec_infer/inference.pdiparams +2 -2
- requirements.txt +32 -29
- svm_model.joblib +3 -0
- worker.py +179 -0
.gitattributes
CHANGED
|
@@ -1,71 +1,71 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
-
magic_pdf-0.8.0a1-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 37 |
-
magic_pdf-0.8.0a2-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 38 |
-
magic_pdf-0.8.0a3-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 39 |
-
paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
|
| 40 |
-
paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 41 |
-
paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 42 |
-
magic_pdf-0.9.0a1-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 43 |
-
magic_pdf-0.9.0a2-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 44 |
-
magic_pdf-0.9.0a3-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 45 |
-
magic_pdf-0.9.0a4-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 46 |
-
magic_pdf-0.9.0a5-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 47 |
-
magic_pdf-0.9.0a6-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 48 |
-
magic_pdf-0.9.0a7-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 49 |
-
magic_pdf-0.9.0a8-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 50 |
-
magic_pdf-0.9.0a9-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 51 |
-
paddleocr/whl/det/en/en_PP-OCRv3_det_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 52 |
-
paddleocr/whl/det/en/en_PP-OCRv3_det_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
|
| 53 |
-
paddleocr/whl/det/ml/Multilingual_PP-OCRv3_det_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 54 |
-
paddleocr/whl/det/ml/Multilingual_PP-OCRv3_det_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
|
| 55 |
-
paddleocr/whl/rec/arabic/arabic_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 56 |
-
paddleocr/whl/rec/chinese_cht/chinese_cht_PP-OCRv3_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 57 |
-
paddleocr/whl/rec/chinese_cht/chinese_cht_PP-OCRv3_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
|
| 58 |
-
paddleocr/whl/rec/cyrillic/cyrillic_PP-OCRv3_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 59 |
-
paddleocr/whl/rec/cyrillic/cyrillic_PP-OCRv3_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
|
| 60 |
-
paddleocr/whl/rec/devanagari/devanagari_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 61 |
-
paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 62 |
-
paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
|
| 63 |
-
paddleocr/whl/rec/japan/japan_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 64 |
-
paddleocr/whl/rec/ka/ka_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 65 |
-
paddleocr/whl/rec/korean/korean_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 66 |
-
paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 67 |
-
paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
|
| 68 |
-
paddleocr/whl/rec/ta/ta_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 69 |
-
paddleocr/whl/rec/te/te_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 70 |
-
*.pdf filter=lfs diff=lfs merge=lfs -text
|
| 71 |
-
*.jpg filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
magic_pdf-0.8.0a1-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
magic_pdf-0.8.0a2-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
magic_pdf-0.8.0a3-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
magic_pdf-0.9.0a1-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
magic_pdf-0.9.0a2-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
magic_pdf-0.9.0a3-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
magic_pdf-0.9.0a4-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
magic_pdf-0.9.0a5-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
magic_pdf-0.9.0a6-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
magic_pdf-0.9.0a7-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
magic_pdf-0.9.0a8-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
magic_pdf-0.9.0a9-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
paddleocr/whl/det/en/en_PP-OCRv3_det_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
paddleocr/whl/det/en/en_PP-OCRv3_det_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
paddleocr/whl/det/ml/Multilingual_PP-OCRv3_det_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
paddleocr/whl/det/ml/Multilingual_PP-OCRv3_det_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
paddleocr/whl/rec/arabic/arabic_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
paddleocr/whl/rec/chinese_cht/chinese_cht_PP-OCRv3_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
paddleocr/whl/rec/chinese_cht/chinese_cht_PP-OCRv3_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
paddleocr/whl/rec/cyrillic/cyrillic_PP-OCRv3_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
paddleocr/whl/rec/cyrillic/cyrillic_PP-OCRv3_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
paddleocr/whl/rec/devanagari/devanagari_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
|
| 63 |
+
paddleocr/whl/rec/japan/japan_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 64 |
+
paddleocr/whl/rec/ka/ka_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 65 |
+
paddleocr/whl/rec/korean/korean_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 66 |
+
paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 67 |
+
paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer/inference.pdmodel filter=lfs diff=lfs merge=lfs -text
|
| 68 |
+
paddleocr/whl/rec/ta/ta_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 69 |
+
paddleocr/whl/rec/te/te_PP-OCRv4_rec_infer/inference.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 70 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
| 71 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
CHANGED
|
@@ -1,26 +1,26 @@
|
|
| 1 |
-
FROM python:3.9
|
| 2 |
-
|
| 3 |
-
WORKDIR /code
|
| 4 |
-
|
| 5 |
-
# Install system dependencies
|
| 6 |
-
RUN apt-get update && apt-get install -y \
|
| 7 |
-
build-essential \
|
| 8 |
-
curl \
|
| 9 |
-
software-properties-common \
|
| 10 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
-
|
| 12 |
-
# Copy requirements first to leverage Docker cache
|
| 13 |
-
COPY ./requirements.txt /code/requirements.txt
|
| 14 |
-
RUN pip install --no-cache-dir -r requirements.txt
|
| 15 |
-
|
| 16 |
-
# Copy the rest of the application
|
| 17 |
-
COPY . /code/
|
| 18 |
-
|
| 19 |
-
# Make sure the inbox and output directories exist
|
| 20 |
-
RUN mkdir -p /code/inbox /code/output
|
| 21 |
-
|
| 22 |
-
# Expose the port
|
| 23 |
-
EXPOSE 7860
|
| 24 |
-
|
| 25 |
-
# Command to run the application
|
| 26 |
CMD ["python", "app.py"]
|
|
|
|
| 1 |
+
FROM python:3.9
|
| 2 |
+
|
| 3 |
+
WORKDIR /code
|
| 4 |
+
|
| 5 |
+
# Install system dependencies
|
| 6 |
+
RUN apt-get update && apt-get install -y \
|
| 7 |
+
build-essential \
|
| 8 |
+
curl \
|
| 9 |
+
software-properties-common \
|
| 10 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
+
|
| 12 |
+
# Copy requirements first to leverage Docker cache
|
| 13 |
+
COPY ./requirements.txt /code/requirements.txt
|
| 14 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 15 |
+
|
| 16 |
+
# Copy the rest of the application
|
| 17 |
+
COPY . /code/
|
| 18 |
+
|
| 19 |
+
# Make sure the inbox and output directories exist
|
| 20 |
+
RUN mkdir -p /code/inbox /code/output
|
| 21 |
+
|
| 22 |
+
# Expose the port
|
| 23 |
+
EXPOSE 7860
|
| 24 |
+
|
| 25 |
+
# Command to run the application
|
| 26 |
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -1,18 +1,18 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: MinerU
|
| 3 |
-
emoji: 📚
|
| 4 |
-
colorFrom: purple
|
| 5 |
-
colorTo: blue
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version: 5.8.0
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
license: agpl-3.0
|
| 11 |
-
models:
|
| 12 |
-
- opendatalab/PDF-Extract-Kit-1.0
|
| 13 |
-
- hantian/layoutreader
|
| 14 |
-
---
|
| 15 |
-
|
| 16 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
| 17 |
-
|
| 18 |
Paper: https://huggingface.co/papers/2409.18839
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: MinerU
|
| 3 |
+
emoji: 📚
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.8.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: agpl-3.0
|
| 11 |
+
models:
|
| 12 |
+
- opendatalab/PDF-Extract-Kit-1.0
|
| 13 |
+
- hantian/layoutreader
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
| 17 |
+
|
| 18 |
Paper: https://huggingface.co/papers/2409.18839
|
app.py
CHANGED
|
@@ -1,54 +1,58 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
import os
|
| 3 |
-
import
|
| 4 |
-
import
|
| 5 |
import uvicorn
|
| 6 |
-
|
|
|
|
| 7 |
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
-
from mineru_single import Processor
|
| 9 |
|
| 10 |
-
processor = Processor()
|
| 11 |
app = FastAPI()
|
| 12 |
-
logging.basicConfig(level=logging.INFO)
|
| 13 |
|
| 14 |
-
|
|
|
|
| 15 |
app.add_middleware(
|
| 16 |
CORSMiddleware,
|
| 17 |
-
allow_origins=["*"],
|
| 18 |
allow_credentials=True,
|
| 19 |
-
allow_methods=["*"],
|
| 20 |
-
allow_headers=["*"],
|
| 21 |
)
|
| 22 |
|
| 23 |
-
|
| 24 |
@app.get("/")
|
| 25 |
async def root():
|
| 26 |
-
"""Health check endpoint"""
|
| 27 |
return {"status": "ok", "message": "API is running"}
|
| 28 |
|
| 29 |
@app.post("/process")
|
| 30 |
async def process_pdf(
|
| 31 |
-
|
| 32 |
x_api_key: str = Header(None, alias="X-API-Key")
|
| 33 |
):
|
| 34 |
-
# Get the secret key from environment variable
|
| 35 |
-
api_key = os.getenv("SECRET_KEY")
|
| 36 |
-
|
| 37 |
if not x_api_key:
|
| 38 |
raise HTTPException(status_code=401, detail="API key is missing")
|
| 39 |
-
|
| 40 |
-
if x_api_key != api_key:
|
| 41 |
raise HTTPException(status_code=401, detail="Invalid API key")
|
| 42 |
|
| 43 |
-
#
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
return {
|
| 47 |
-
"message": "
|
| 48 |
-
"
|
| 49 |
-
"content": markdown_text
|
| 50 |
}
|
| 51 |
|
| 52 |
-
# If you want to run locally or for debug:
|
| 53 |
if __name__ == "__main__":
|
| 54 |
-
uvicorn.run(app, host="0.0.0.0", port=
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
import os
|
| 3 |
+
import json
|
| 4 |
+
import uuid
|
| 5 |
import uvicorn
|
| 6 |
+
import pika
|
| 7 |
+
from fastapi import FastAPI, Body, Header, HTTPException
|
| 8 |
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
| 9 |
|
|
|
|
| 10 |
app = FastAPI()
|
|
|
|
| 11 |
|
| 12 |
+
API_KEY = os.getenv("SECRET_KEY")
|
| 13 |
+
|
| 14 |
app.add_middleware(
|
| 15 |
CORSMiddleware,
|
| 16 |
+
allow_origins=["*"],
|
| 17 |
allow_credentials=True,
|
| 18 |
+
allow_methods=["*"],
|
| 19 |
+
allow_headers=["*"],
|
| 20 |
)
|
| 21 |
|
|
|
|
| 22 |
@app.get("/")
|
| 23 |
async def root():
|
|
|
|
| 24 |
return {"status": "ok", "message": "API is running"}
|
| 25 |
|
| 26 |
@app.post("/process")
|
| 27 |
async def process_pdf(
|
| 28 |
+
input_json: dict = Body(...),
|
| 29 |
x_api_key: str = Header(None, alias="X-API-Key")
|
| 30 |
):
|
|
|
|
|
|
|
|
|
|
| 31 |
if not x_api_key:
|
| 32 |
raise HTTPException(status_code=401, detail="API key is missing")
|
| 33 |
+
if x_api_key != API_KEY:
|
|
|
|
| 34 |
raise HTTPException(status_code=401, detail="Invalid API key")
|
| 35 |
|
| 36 |
+
# Connect to RabbitMQ
|
| 37 |
+
rabbit_url = os.getenv("RABBITMQ_URL", "amqp://guest:guest@localhost:5672/")
|
| 38 |
+
connection = pika.BlockingConnection(pika.URLParameters(rabbit_url))
|
| 39 |
+
channel = connection.channel()
|
| 40 |
+
channel.queue_declare(queue="ml_server", durable=True)
|
| 41 |
+
|
| 42 |
+
channel.basic_publish(
|
| 43 |
+
exchange="",
|
| 44 |
+
routing_key="ml_server",
|
| 45 |
+
body=json.dumps(input_json),
|
| 46 |
+
properties=pika.BasicProperties(
|
| 47 |
+
headers={"process": "topic_extraction"}
|
| 48 |
+
)
|
| 49 |
+
)
|
| 50 |
+
connection.close()
|
| 51 |
|
| 52 |
return {
|
| 53 |
+
"message": "Job queued",
|
| 54 |
+
"request_id": input_json.get("headers", {}).get("request_id", str(uuid.uuid4()))
|
|
|
|
| 55 |
}
|
| 56 |
|
|
|
|
| 57 |
if __name__ == "__main__":
|
| 58 |
+
uvicorn.run(app, host="0.0.0.0", port=8000)
|
download_models_hf.py
CHANGED
|
@@ -1,66 +1,66 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import os
|
| 3 |
-
|
| 4 |
-
import requests
|
| 5 |
-
from huggingface_hub import snapshot_download
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
def download_json(url):
|
| 9 |
-
# 下载JSON文件
|
| 10 |
-
response = requests.get(url)
|
| 11 |
-
response.raise_for_status() # 检查请求是否成功
|
| 12 |
-
return response.json()
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
def download_and_modify_json(url, local_filename, modifications):
|
| 16 |
-
if os.path.exists(local_filename):
|
| 17 |
-
data = json.load(open(local_filename))
|
| 18 |
-
config_version = data.get('config_version', '0.0.0')
|
| 19 |
-
if config_version < '1.1.1':
|
| 20 |
-
data = download_json(url)
|
| 21 |
-
else:
|
| 22 |
-
data = download_json(url)
|
| 23 |
-
|
| 24 |
-
# 修改内容
|
| 25 |
-
for key, value in modifications.items():
|
| 26 |
-
data[key] = value
|
| 27 |
-
|
| 28 |
-
# 保存修改后的内容
|
| 29 |
-
with open(local_filename, 'w', encoding='utf-8') as f:
|
| 30 |
-
json.dump(data, f, ensure_ascii=False, indent=4)
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
if __name__ == '__main__':
|
| 34 |
-
|
| 35 |
-
mineru_patterns = [
|
| 36 |
-
"models/Layout/LayoutLMv3/*",
|
| 37 |
-
"models/Layout/YOLO/*",
|
| 38 |
-
"models/MFD/YOLO/*",
|
| 39 |
-
"models/MFR/unimernet_small_2501/*",
|
| 40 |
-
"models/TabRec/TableMaster/*",
|
| 41 |
-
"models/TabRec/StructEqTable/*",
|
| 42 |
-
]
|
| 43 |
-
model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)
|
| 44 |
-
|
| 45 |
-
layoutreader_pattern = [
|
| 46 |
-
"*.json",
|
| 47 |
-
"*.safetensors",
|
| 48 |
-
]
|
| 49 |
-
layoutreader_model_dir = snapshot_download('hantian/layoutreader', allow_patterns=layoutreader_pattern)
|
| 50 |
-
|
| 51 |
-
model_dir = model_dir + '/models'
|
| 52 |
-
print(f'model_dir is: {model_dir}')
|
| 53 |
-
print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
|
| 54 |
-
|
| 55 |
-
json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json'
|
| 56 |
-
config_file_name = 'magic-pdf.json'
|
| 57 |
-
home_dir = os.path.expanduser('~')
|
| 58 |
-
config_file = os.path.join(home_dir, config_file_name)
|
| 59 |
-
|
| 60 |
-
json_mods = {
|
| 61 |
-
'models-dir': model_dir,
|
| 62 |
-
'layoutreader-model-dir': layoutreader_model_dir,
|
| 63 |
-
}
|
| 64 |
-
|
| 65 |
-
download_and_modify_json(json_url, config_file, json_mods)
|
| 66 |
-
print(f'The configuration file has been configured successfully, the path is: {config_file}')
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
import requests
|
| 5 |
+
from huggingface_hub import snapshot_download
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def download_json(url):
|
| 9 |
+
# 下载JSON文件
|
| 10 |
+
response = requests.get(url)
|
| 11 |
+
response.raise_for_status() # 检查请求是否成功
|
| 12 |
+
return response.json()
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def download_and_modify_json(url, local_filename, modifications):
|
| 16 |
+
if os.path.exists(local_filename):
|
| 17 |
+
data = json.load(open(local_filename))
|
| 18 |
+
config_version = data.get('config_version', '0.0.0')
|
| 19 |
+
if config_version < '1.1.1':
|
| 20 |
+
data = download_json(url)
|
| 21 |
+
else:
|
| 22 |
+
data = download_json(url)
|
| 23 |
+
|
| 24 |
+
# 修改内容
|
| 25 |
+
for key, value in modifications.items():
|
| 26 |
+
data[key] = value
|
| 27 |
+
|
| 28 |
+
# 保存修改后的内容
|
| 29 |
+
with open(local_filename, 'w', encoding='utf-8') as f:
|
| 30 |
+
json.dump(data, f, ensure_ascii=False, indent=4)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
if __name__ == '__main__':
|
| 34 |
+
|
| 35 |
+
mineru_patterns = [
|
| 36 |
+
"models/Layout/LayoutLMv3/*",
|
| 37 |
+
"models/Layout/YOLO/*",
|
| 38 |
+
"models/MFD/YOLO/*",
|
| 39 |
+
"models/MFR/unimernet_small_2501/*",
|
| 40 |
+
"models/TabRec/TableMaster/*",
|
| 41 |
+
"models/TabRec/StructEqTable/*",
|
| 42 |
+
]
|
| 43 |
+
model_dir = snapshot_download('opendatalab/PDF-Extract-Kit-1.0', allow_patterns=mineru_patterns)
|
| 44 |
+
|
| 45 |
+
layoutreader_pattern = [
|
| 46 |
+
"*.json",
|
| 47 |
+
"*.safetensors",
|
| 48 |
+
]
|
| 49 |
+
layoutreader_model_dir = snapshot_download('hantian/layoutreader', allow_patterns=layoutreader_pattern)
|
| 50 |
+
|
| 51 |
+
model_dir = model_dir + '/models'
|
| 52 |
+
print(f'model_dir is: {model_dir}')
|
| 53 |
+
print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
|
| 54 |
+
|
| 55 |
+
json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json'
|
| 56 |
+
config_file_name = 'magic-pdf.json'
|
| 57 |
+
home_dir = os.path.expanduser('~')
|
| 58 |
+
config_file = os.path.join(home_dir, config_file_name)
|
| 59 |
+
|
| 60 |
+
json_mods = {
|
| 61 |
+
'models-dir': model_dir,
|
| 62 |
+
'layoutreader-model-dir': layoutreader_model_dir,
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
download_and_modify_json(json_url, config_file, json_mods)
|
| 66 |
+
print(f'The configuration file has been configured successfully, the path is: {config_file}')
|
header.html
CHANGED
|
@@ -1,132 +1,132 @@
|
|
| 1 |
-
<html>
|
| 2 |
-
<head>
|
| 3 |
-
<!-- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.3/css/bulma.min.css"> -->
|
| 4 |
-
<link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
|
| 5 |
-
<style>
|
| 6 |
-
.link-block {
|
| 7 |
-
border: 1px solid transparent;
|
| 8 |
-
border-radius: 24px;
|
| 9 |
-
background-color: rgba(54, 54, 54, 1);
|
| 10 |
-
cursor: pointer !important;
|
| 11 |
-
}
|
| 12 |
-
.link-block:hover {
|
| 13 |
-
background-color: rgba(54, 54, 54, 0.75) !important;
|
| 14 |
-
cursor: pointer !important;
|
| 15 |
-
}
|
| 16 |
-
.external-link {
|
| 17 |
-
display: inline-flex;
|
| 18 |
-
align-items: center;
|
| 19 |
-
height: 36px;
|
| 20 |
-
line-height: 36px;
|
| 21 |
-
padding: 0 16px;
|
| 22 |
-
cursor: pointer !important;
|
| 23 |
-
}
|
| 24 |
-
.external-link,
|
| 25 |
-
.external-link:hover {
|
| 26 |
-
cursor: pointer !important;
|
| 27 |
-
}
|
| 28 |
-
a {
|
| 29 |
-
text-decoration: none;
|
| 30 |
-
}
|
| 31 |
-
</style>
|
| 32 |
-
</head>
|
| 33 |
-
|
| 34 |
-
<body>
|
| 35 |
-
<div style="
|
| 36 |
-
display: flex;
|
| 37 |
-
flex-direction: column;
|
| 38 |
-
justify-content: center;
|
| 39 |
-
align-items: center;
|
| 40 |
-
text-align: center;
|
| 41 |
-
background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
|
| 42 |
-
padding: 24px;
|
| 43 |
-
gap: 24px;
|
| 44 |
-
border-radius: 8px;
|
| 45 |
-
">
|
| 46 |
-
<div style="
|
| 47 |
-
display: flex;
|
| 48 |
-
flex-direction: column;
|
| 49 |
-
align-items: center;
|
| 50 |
-
gap: 16px;
|
| 51 |
-
">
|
| 52 |
-
<div style="display: flex; flex-direction: column; gap: 8px">
|
| 53 |
-
<h1 style="
|
| 54 |
-
font-size: 48px;
|
| 55 |
-
color: #fafafa;
|
| 56 |
-
margin: 0;
|
| 57 |
-
font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
|
| 58 |
-
'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
|
| 59 |
-
">
|
| 60 |
-
MinerU: PDF Extraction Demo
|
| 61 |
-
</h1>
|
| 62 |
-
</div>
|
| 63 |
-
</div>
|
| 64 |
-
|
| 65 |
-
<p style="
|
| 66 |
-
margin: 0;
|
| 67 |
-
line-height: 1.6rem;
|
| 68 |
-
font-size: 16px;
|
| 69 |
-
color: #fafafa;
|
| 70 |
-
opacity: 0.8;
|
| 71 |
-
">
|
| 72 |
-
A one-stop, open-source, high-quality data extraction tool, supports
|
| 73 |
-
PDF/webpage/e-book extraction.<br>
|
| 74 |
-
</p>
|
| 75 |
-
<style>
|
| 76 |
-
.link-block {
|
| 77 |
-
display: inline-block;
|
| 78 |
-
}
|
| 79 |
-
.link-block + .link-block {
|
| 80 |
-
margin-left: 20px;
|
| 81 |
-
}
|
| 82 |
-
</style>
|
| 83 |
-
|
| 84 |
-
<div class="column has-text-centered">
|
| 85 |
-
<div class="publication-links">
|
| 86 |
-
<!-- Code Link. -->
|
| 87 |
-
<span class="link-block">
|
| 88 |
-
<a href="https://github.com/opendatalab/MinerU" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
| 89 |
-
<span class="icon" style="margin-right: 4px">
|
| 90 |
-
<i class="fab fa-github" style="color: white; margin-right: 4px"></i>
|
| 91 |
-
</span>
|
| 92 |
-
<span style="color: white">Code</span>
|
| 93 |
-
</a>
|
| 94 |
-
</span>
|
| 95 |
-
|
| 96 |
-
<!-- arXiv Link. -->
|
| 97 |
-
<span class="link-block">
|
| 98 |
-
<a href="https://arxiv.org/abs/2409.18839" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
| 99 |
-
<span class="icon" style="margin-right: 8px">
|
| 100 |
-
<i class="fas fa-file" style="color: white"></i>
|
| 101 |
-
</span>
|
| 102 |
-
<span style="color: white">Paper</span>
|
| 103 |
-
</a>
|
| 104 |
-
</span>
|
| 105 |
-
|
| 106 |
-
<!-- Homepage Link. -->
|
| 107 |
-
<span class="link-block">
|
| 108 |
-
<a href="https://mineru.org.cn/home?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
| 109 |
-
<span class="icon" style="margin-right: 8px">
|
| 110 |
-
<i class="fas fa-home" style="color: white"></i>
|
| 111 |
-
</span>
|
| 112 |
-
<span style="color: white">Homepage</span>
|
| 113 |
-
</a>
|
| 114 |
-
</span>
|
| 115 |
-
|
| 116 |
-
<!-- Client Link. -->
|
| 117 |
-
<span class="link-block">
|
| 118 |
-
<a href="https://mineru.org.cn/client?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
| 119 |
-
<span class="icon" style="margin-right: 8px">
|
| 120 |
-
<i class="fas fa-download" style="color: white"></i>
|
| 121 |
-
</span>
|
| 122 |
-
<span style="color: white">Download</span>
|
| 123 |
-
</a>
|
| 124 |
-
</span>
|
| 125 |
-
</div>
|
| 126 |
-
</div>
|
| 127 |
-
|
| 128 |
-
<!-- New Demo Links -->
|
| 129 |
-
</div>
|
| 130 |
-
|
| 131 |
-
|
| 132 |
</body></html>
|
|
|
|
| 1 |
+
<html>
|
| 2 |
+
<head>
|
| 3 |
+
<!-- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bulma@0.9.3/css/bulma.min.css"> -->
|
| 4 |
+
<link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
|
| 5 |
+
<style>
|
| 6 |
+
.link-block {
|
| 7 |
+
border: 1px solid transparent;
|
| 8 |
+
border-radius: 24px;
|
| 9 |
+
background-color: rgba(54, 54, 54, 1);
|
| 10 |
+
cursor: pointer !important;
|
| 11 |
+
}
|
| 12 |
+
.link-block:hover {
|
| 13 |
+
background-color: rgba(54, 54, 54, 0.75) !important;
|
| 14 |
+
cursor: pointer !important;
|
| 15 |
+
}
|
| 16 |
+
.external-link {
|
| 17 |
+
display: inline-flex;
|
| 18 |
+
align-items: center;
|
| 19 |
+
height: 36px;
|
| 20 |
+
line-height: 36px;
|
| 21 |
+
padding: 0 16px;
|
| 22 |
+
cursor: pointer !important;
|
| 23 |
+
}
|
| 24 |
+
.external-link,
|
| 25 |
+
.external-link:hover {
|
| 26 |
+
cursor: pointer !important;
|
| 27 |
+
}
|
| 28 |
+
a {
|
| 29 |
+
text-decoration: none;
|
| 30 |
+
}
|
| 31 |
+
</style>
|
| 32 |
+
</head>
|
| 33 |
+
|
| 34 |
+
<body>
|
| 35 |
+
<div style="
|
| 36 |
+
display: flex;
|
| 37 |
+
flex-direction: column;
|
| 38 |
+
justify-content: center;
|
| 39 |
+
align-items: center;
|
| 40 |
+
text-align: center;
|
| 41 |
+
background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
|
| 42 |
+
padding: 24px;
|
| 43 |
+
gap: 24px;
|
| 44 |
+
border-radius: 8px;
|
| 45 |
+
">
|
| 46 |
+
<div style="
|
| 47 |
+
display: flex;
|
| 48 |
+
flex-direction: column;
|
| 49 |
+
align-items: center;
|
| 50 |
+
gap: 16px;
|
| 51 |
+
">
|
| 52 |
+
<div style="display: flex; flex-direction: column; gap: 8px">
|
| 53 |
+
<h1 style="
|
| 54 |
+
font-size: 48px;
|
| 55 |
+
color: #fafafa;
|
| 56 |
+
margin: 0;
|
| 57 |
+
font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
|
| 58 |
+
'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
|
| 59 |
+
">
|
| 60 |
+
MinerU: PDF Extraction Demo
|
| 61 |
+
</h1>
|
| 62 |
+
</div>
|
| 63 |
+
</div>
|
| 64 |
+
|
| 65 |
+
<p style="
|
| 66 |
+
margin: 0;
|
| 67 |
+
line-height: 1.6rem;
|
| 68 |
+
font-size: 16px;
|
| 69 |
+
color: #fafafa;
|
| 70 |
+
opacity: 0.8;
|
| 71 |
+
">
|
| 72 |
+
A one-stop, open-source, high-quality data extraction tool, supports
|
| 73 |
+
PDF/webpage/e-book extraction.<br>
|
| 74 |
+
</p>
|
| 75 |
+
<style>
|
| 76 |
+
.link-block {
|
| 77 |
+
display: inline-block;
|
| 78 |
+
}
|
| 79 |
+
.link-block + .link-block {
|
| 80 |
+
margin-left: 20px;
|
| 81 |
+
}
|
| 82 |
+
</style>
|
| 83 |
+
|
| 84 |
+
<div class="column has-text-centered">
|
| 85 |
+
<div class="publication-links">
|
| 86 |
+
<!-- Code Link. -->
|
| 87 |
+
<span class="link-block">
|
| 88 |
+
<a href="https://github.com/opendatalab/MinerU" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
| 89 |
+
<span class="icon" style="margin-right: 4px">
|
| 90 |
+
<i class="fab fa-github" style="color: white; margin-right: 4px"></i>
|
| 91 |
+
</span>
|
| 92 |
+
<span style="color: white">Code</span>
|
| 93 |
+
</a>
|
| 94 |
+
</span>
|
| 95 |
+
|
| 96 |
+
<!-- arXiv Link. -->
|
| 97 |
+
<span class="link-block">
|
| 98 |
+
<a href="https://arxiv.org/abs/2409.18839" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
| 99 |
+
<span class="icon" style="margin-right: 8px">
|
| 100 |
+
<i class="fas fa-file" style="color: white"></i>
|
| 101 |
+
</span>
|
| 102 |
+
<span style="color: white">Paper</span>
|
| 103 |
+
</a>
|
| 104 |
+
</span>
|
| 105 |
+
|
| 106 |
+
<!-- Homepage Link. -->
|
| 107 |
+
<span class="link-block">
|
| 108 |
+
<a href="https://mineru.org.cn/home?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
| 109 |
+
<span class="icon" style="margin-right: 8px">
|
| 110 |
+
<i class="fas fa-home" style="color: white"></i>
|
| 111 |
+
</span>
|
| 112 |
+
<span style="color: white">Homepage</span>
|
| 113 |
+
</a>
|
| 114 |
+
</span>
|
| 115 |
+
|
| 116 |
+
<!-- Client Link. -->
|
| 117 |
+
<span class="link-block">
|
| 118 |
+
<a href="https://mineru.org.cn/client?source=huggingface" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
|
| 119 |
+
<span class="icon" style="margin-right: 8px">
|
| 120 |
+
<i class="fas fa-download" style="color: white"></i>
|
| 121 |
+
</span>
|
| 122 |
+
<span style="color: white">Download</span>
|
| 123 |
+
</a>
|
| 124 |
+
</span>
|
| 125 |
+
</div>
|
| 126 |
+
</div>
|
| 127 |
+
|
| 128 |
+
<!-- New Demo Links -->
|
| 129 |
+
</div>
|
| 130 |
+
|
| 131 |
+
|
| 132 |
</body></html>
|
inference_svm_model.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
from joblib import load
|
| 5 |
+
|
| 6 |
+
def load_svm_model(model_path: str):
    """Deserialize and return the SVM classifier stored at *model_path* (joblib format)."""
    model = load(model_path)
    return model
|
| 8 |
+
|
| 9 |
+
def classify_image(
    image_path: str,
    loaded_model,
    label_map: dict,
    image_size=(128, 128)
) -> str:
    """Classify one image file with the given SVM model.

    The image is resized to *image_size*, flattened into a single feature
    row and passed to ``loaded_model.predict``. Returns the human-readable
    label from *label_map*. An unreadable image falls back to label 0
    ("irrelevant" in the default map).
    """
    loaded = cv2.imread(image_path)
    if loaded is None:
        # Unreadable file: treat it as the label-0 class rather than crash.
        return label_map[0]

    resized = cv2.resize(loaded, image_size)
    features = resized.flatten().reshape(1, -1)
    prediction = loaded_model.predict(features)[0]
    return label_map[prediction]
|
| 24 |
+
|
| 25 |
+
if __name__ == "__main__":
    # Smoke test: classify a local sample image with the bundled model.
    svm = load_svm_model("./model_classification/svm_model.joblib")
    labels = {0: "irrelevant", 1: "relevant"}
    outcome = classify_image("test.jpg", svm, labels)
    print("Classification result:", outcome)
|
mineru_single.py
CHANGED
|
@@ -1,37 +1,16 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
import os
|
| 3 |
-
import time
|
| 4 |
-
import base64
|
| 5 |
-
import json
|
| 6 |
-
import re
|
| 7 |
import uuid
|
| 8 |
-
|
| 9 |
-
from loguru import logger
|
| 10 |
import requests
|
| 11 |
-
from
|
| 12 |
-
|
| 13 |
-
import pymupdf
|
| 14 |
-
from magic_pdf.data.data_reader_writer.base import DataWriter
|
| 15 |
from magic_pdf.data.dataset import PymuDocDataset
|
| 16 |
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
| 17 |
from magic_pdf.data.io.s3 import S3Writer
|
|
|
|
| 18 |
|
| 19 |
-
|
| 20 |
-
# def to_pdf(file_path):
|
| 21 |
-
# """
|
| 22 |
-
# If input is not PDF, convert it to PDF using PyMuPDF
|
| 23 |
-
# """
|
| 24 |
-
# with pymupdf.open(file_path) as doc:
|
| 25 |
-
# if doc.is_pdf:
|
| 26 |
-
# return file_path
|
| 27 |
-
# else:
|
| 28 |
-
# pdf_bytes = doc.convert_to_pdf()
|
| 29 |
-
# unique_filename = f"{uuid.uuid4()}.pdf"
|
| 30 |
-
# tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename)
|
| 31 |
-
# with open(tmp_file_path, "wb") as tmp_pdf_file:
|
| 32 |
-
# tmp_pdf_file.write(pdf_bytes)
|
| 33 |
-
# return tmp_file_path
|
| 34 |
-
|
| 35 |
|
| 36 |
class Processor:
|
| 37 |
def __init__(self):
|
|
@@ -41,61 +20,104 @@ class Processor:
|
|
| 41 |
bucket=os.getenv("S3_BUCKET_NAME"),
|
| 42 |
endpoint_url=os.getenv("S3_ENDPOINT"),
|
| 43 |
)
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
with open("/home/user/magic-pdf.json", "r") as f:
|
| 47 |
config = json.load(f)
|
|
|
|
| 48 |
self.layout_mode = config["layout-config"]["model"]
|
| 49 |
self.formula_enable = config["formula-config"]["enable"]
|
| 50 |
self.table_enable = config["table-config"]["enable"]
|
| 51 |
self.language = "en"
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
os.system('wget https://github.com/opendatalab/MinerU/raw/dev/scripts/download_models_hf.py -O download_models_hf.py')
|
| 61 |
-
os.system('python download_models_hf.py')
|
| 62 |
-
|
| 63 |
-
def process(self, file_link: str, file_name: str = str(uuid.uuid4())):
|
| 64 |
-
print("Processing file")
|
| 65 |
-
response = requests.get(file_link)
|
| 66 |
if response.status_code != 200:
|
| 67 |
-
raise Exception(f"Failed to download
|
| 68 |
pdf_bytes = response.content
|
| 69 |
|
| 70 |
dataset = PymuDocDataset(pdf_bytes)
|
| 71 |
-
inference = doc_analyze(
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
class ImageWriter(DataWriter):
|
| 78 |
-
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
self._redundant_images_paths = []
|
| 81 |
|
| 82 |
-
def _process_image(self, data: bytes) -> str:
|
| 83 |
-
# TODO: actually process image
|
| 84 |
-
return True
|
| 85 |
-
|
| 86 |
def write(self, path: str, data: bytes) -> None:
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
else:
|
| 91 |
self._redundant_images_paths.append(path)
|
| 92 |
|
| 93 |
-
def remove_redundant_images(self, md_content: str):
|
| 94 |
for path in self._redundant_images_paths:
|
| 95 |
md_content = md_content.replace(f"", "")
|
| 96 |
return md_content
|
| 97 |
|
| 98 |
if __name__ == "__main__":
|
| 99 |
processor = Processor()
|
| 100 |
-
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import uuid
|
| 4 |
+
import json
|
|
|
|
| 5 |
import requests
|
| 6 |
+
from loguru import logger
|
| 7 |
+
|
|
|
|
|
|
|
| 8 |
from magic_pdf.data.dataset import PymuDocDataset
|
| 9 |
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
| 10 |
from magic_pdf.data.io.s3 import S3Writer
|
| 11 |
+
from magic_pdf.data.data_reader_writer.base import DataWriter
|
| 12 |
|
| 13 |
+
from inference_svm_model import load_svm_model, classify_image
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
class Processor:
|
| 16 |
def __init__(self):
|
|
|
|
| 20 |
bucket=os.getenv("S3_BUCKET_NAME"),
|
| 21 |
endpoint_url=os.getenv("S3_ENDPOINT"),
|
| 22 |
)
|
| 23 |
+
|
| 24 |
+
model_path = os.getenv("SVM_MODEL_PATH", "./svm_model/svm_model.joblib")
|
| 25 |
+
self.svm_model = load_svm_model(model_path)
|
| 26 |
+
self.label_map = {0: "irrelevant", 1: "relevant"}
|
| 27 |
|
| 28 |
with open("/home/user/magic-pdf.json", "r") as f:
|
| 29 |
config = json.load(f)
|
| 30 |
+
|
| 31 |
self.layout_mode = config["layout-config"]["model"]
|
| 32 |
self.formula_enable = config["formula-config"]["enable"]
|
| 33 |
self.table_enable = config["table-config"]["enable"]
|
| 34 |
self.language = "en"
|
| 35 |
+
|
| 36 |
+
endpoint = os.getenv("S3_ENDPOINT", "").rstrip("/")
|
| 37 |
+
bucket = os.getenv("S3_BUCKET_NAME", "")
|
| 38 |
+
self.prefix = f"{endpoint}/{bucket}/document-extracts/"
|
| 39 |
+
|
| 40 |
+
def process(self, file_url: str) -> str:
|
| 41 |
+
logger.info("Processing file: {}", file_url)
|
| 42 |
+
response = requests.get(file_url)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
if response.status_code != 200:
|
| 44 |
+
raise Exception(f"Failed to download PDF: {file_url}")
|
| 45 |
pdf_bytes = response.content
|
| 46 |
|
| 47 |
dataset = PymuDocDataset(pdf_bytes)
|
| 48 |
+
inference = doc_analyze(
|
| 49 |
+
dataset,
|
| 50 |
+
ocr=True,
|
| 51 |
+
lang=self.language,
|
| 52 |
+
layout_model=self.layout_mode,
|
| 53 |
+
formula_enable=self.formula_enable,
|
| 54 |
+
table_enable=self.table_enable
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
image_writer = ImageWriter(self.s3_writer, self.svm_model, self.label_map)
|
| 58 |
+
|
| 59 |
+
pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
|
| 60 |
|
| 61 |
+
folder_name = str(uuid.uuid4())
|
| 62 |
+
md_content = pipe_result.get_markdown(self.prefix + folder_name + "/")
|
| 63 |
+
|
| 64 |
+
# Remove references to images classified as "irrelevant"
|
| 65 |
+
final_markdown = image_writer.remove_redundant_images(md_content)
|
| 66 |
+
return final_markdown
|
| 67 |
+
|
| 68 |
+
def process_batch(self, file_urls: list[str]) -> dict:
|
| 69 |
+
results = {}
|
| 70 |
+
for url in file_urls:
|
| 71 |
+
try:
|
| 72 |
+
md = self.process(url)
|
| 73 |
+
results[url] = md
|
| 74 |
+
except Exception as e:
|
| 75 |
+
results[url] = f"Error: {str(e)}"
|
| 76 |
+
return results
|
| 77 |
|
| 78 |
class ImageWriter(DataWriter):
    """
    Receives each extracted image. Classifies it with the SVM model,
    uploads it to S3 if relevant, or records its path for later removal
    from the Markdown if irrelevant.
    """
    def __init__(self, s3_writer: S3Writer, svm_model, label_map):
        self.s3_writer = s3_writer
        self.svm_model = svm_model
        self.label_map = label_map
        # Paths of images classified as irrelevant; consumed by
        # remove_redundant_images().
        self._redundant_images_paths = []

    def write(self, path: str, data: bytes) -> None:
        """Classify the image bytes; upload relevant images, flag the rest."""
        # classify_image() works on a file path, so spill the bytes to a
        # temporary file. (os and uuid are module-level imports; the
        # previous redundant local imports were dropped.)
        import tempfile

        tmp_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4()}.jpg")
        with open(tmp_path, "wb") as f:
            f.write(data)
        try:
            label_str = classify_image(tmp_path, self.svm_model, self.label_map)
        finally:
            # Always clean up, even if classification raises.
            os.remove(tmp_path)

        if label_str == "relevant":
            # Upload to S3
            self.s3_writer.write(path, data)
        else:
            self._redundant_images_paths.append(path)

    def remove_redundant_images(self, md_content: str) -> str:
        """Strip references to irrelevant images from the Markdown."""
        # NOTE(review): the replacement pattern appears garbled/lost in this
        # copy (it replaces "" with "", a no-op); it presumably should match
        # the Markdown image reference built from *path* — confirm upstream.
        for path in self._redundant_images_paths:
            md_content = md_content.replace(f"", "")
        return md_content
|
| 113 |
|
| 114 |
if __name__ == "__main__":
|
| 115 |
processor = Processor()
|
| 116 |
+
|
| 117 |
+
single_url = "https://example.com/somefile.pdf"
|
| 118 |
+
markdown_result = processor.process(single_url)
|
| 119 |
+
print("Single file Markdown:\n", markdown_result)
|
| 120 |
+
|
| 121 |
+
multiple_urls = ["https://example.com/file1.pdf", "https://example.com/file2.pdf"]
|
| 122 |
+
batch_results = processor.process_batch(multiple_urls)
|
| 123 |
+
print("Batch results:", batch_results)
|
model_classification/svm_model.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cfd07af67fb52073a477bcded6ed710f402f492d4fbc11945fbab1a68f7ceb62
|
| 3 |
+
size 219034075
|
paddleocr/whl/rec/cyrillic/cyrillic_PP-OCRv3_rec_infer/inference.pdiparams
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d1f185677978a6e3e908a7123d2f37ff64dd5ae87e594dd1281331aedb26ad27
|
| 3 |
+
size 6946816
|
requirements.txt
CHANGED
|
@@ -1,29 +1,32 @@
|
|
| 1 |
-
boto3>=1.28.43
|
| 2 |
-
Brotli>=1.1.0
|
| 3 |
-
click>=8.1.7
|
| 4 |
-
PyMuPDF>=1.24.9,<1.24.14
|
| 5 |
-
loguru>=0.6.0
|
| 6 |
-
numpy>=1.21.6,<2.0.0
|
| 7 |
-
fast-langdetect>=0.2.3
|
| 8 |
-
scikit-learn>=1.0.2
|
| 9 |
-
transformers
|
| 10 |
-
pdfminer.six==20231228
|
| 11 |
-
unimernet==0.2.3
|
| 12 |
-
doclayout_yolo==0.0.2b1
|
| 13 |
-
matplotlib
|
| 14 |
-
ultralytics>=8.3.48
|
| 15 |
-
paddleocr==2.7.3
|
| 16 |
-
paddlepaddle-gpu @ https://paddle-whl.bj.bcebos.com/stable/cu118/paddlepaddle-gpu/paddlepaddle_gpu-3.0.0b1-cp310-cp310-linux_x86_64.whl
|
| 17 |
-
struct-eqtable==0.3.2
|
| 18 |
-
detectron2 @ https://wheels-1251341229.cos.ap-shanghai.myqcloud.com/assets/whl/detectron2/detectron2-0.6-cp310-cp310-linux_x86_64.whl
|
| 19 |
-
magic-pdf>=1.0.1
|
| 20 |
-
torch>=2.2.2,<=2.3.1
|
| 21 |
-
torchvision>=0.17.2,<=0.18.1
|
| 22 |
-
rapid-table>=1.0.3,<2.0.0
|
| 23 |
-
rapidocr-paddle
|
| 24 |
-
rapidocr-onnxruntime
|
| 25 |
-
gradio-pdf>=0.0.21
|
| 26 |
-
openai
|
| 27 |
-
fastapi
|
| 28 |
-
uvicorn
|
| 29 |
-
python-multipart
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
boto3>=1.28.43
|
| 2 |
+
Brotli>=1.1.0
|
| 3 |
+
click>=8.1.7
|
| 4 |
+
PyMuPDF>=1.24.9,<1.24.14
|
| 5 |
+
loguru>=0.6.0
|
| 6 |
+
numpy>=1.21.6,<2.0.0
|
| 7 |
+
fast-langdetect>=0.2.3
|
| 8 |
+
scikit-learn>=1.0.2
|
| 9 |
+
transformers
|
| 10 |
+
pdfminer.six==20231228
|
| 11 |
+
unimernet==0.2.3
|
| 12 |
+
doclayout_yolo==0.0.2b1
|
| 13 |
+
matplotlib
|
| 14 |
+
ultralytics>=8.3.48
|
| 15 |
+
paddleocr==2.7.3
|
| 16 |
+
paddlepaddle-gpu @ https://paddle-whl.bj.bcebos.com/stable/cu118/paddlepaddle-gpu/paddlepaddle_gpu-3.0.0b1-cp310-cp310-linux_x86_64.whl
|
| 17 |
+
struct-eqtable==0.3.2
|
| 18 |
+
detectron2 @ https://wheels-1251341229.cos.ap-shanghai.myqcloud.com/assets/whl/detectron2/detectron2-0.6-cp310-cp310-linux_x86_64.whl
|
| 19 |
+
magic-pdf>=1.0.1
|
| 20 |
+
torch>=2.2.2,<=2.3.1
|
| 21 |
+
torchvision>=0.17.2,<=0.18.1
|
| 22 |
+
rapid-table>=1.0.3,<2.0.0
|
| 23 |
+
rapidocr-paddle
|
| 24 |
+
rapidocr-onnxruntime
|
| 25 |
+
gradio-pdf>=0.0.21
|
| 26 |
+
openai
|
| 27 |
+
fastapi
|
| 28 |
+
uvicorn
|
| 29 |
+
python-multipart
|
| 30 |
+
pika==1.3.2
|
| 31 |
+
joblib==1.4.2
|
| 32 |
+
opencv-python-headless==4.11.0.86
|
svm_model.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cfd07af67fb52073a477bcded6ed710f402f492d4fbc11945fbab1a68f7ceb62
|
| 3 |
+
size 219034075
|
worker.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
import os
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
+
import threading
|
| 6 |
+
import multiprocessing
|
| 7 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 8 |
+
import pika
|
| 9 |
+
|
| 10 |
+
from mineru_single import Processor
|
| 11 |
+
|
| 12 |
+
processor = Processor()
|
| 13 |
+
|
| 14 |
+
def run_pipeline(body_bytes: bytes):
    """
    Decode a RabbitMQ message body and, for "process_files" requests, run
    the MinerU batch processor over the listed file URLs.

    Expected payload shape::

        {
          "headers": {"request_type": "process_files", "request_id": "..."},
          "body": {"input_files": [...], "topics": [...]}
        }

    Returns a tuple ``(raw_text_outputs: str, parsed_json_outputs: dict)``.
    Any other request_type is returned untouched as
    ``("No processing done", data)``.
    """
    data = json.loads(body_bytes.decode("utf-8"))

    headers = data.get("headers", {})
    request_type = headers.get("request_type", "")
    request_id = headers.get("request_id", "")
    body = data.get("body", {})

    if request_type != "process_files":
        # Nothing to do for other request types.
        return "No processing done", data

    input_files = body.get("input_files", [])
    topics = body.get("topics", [])

    # Remember which logical file key each URL belongs to so the response
    # can be keyed the same way as the request.
    urls = []
    file_key_map = {}
    for entry in input_files:
        url = entry.get("url", "")
        urls.append(url)
        file_key_map[url] = entry.get("key", "")

    batch_results = processor.process_batch(urls)  # {url: markdown_string}

    md_context = [
        {"key": file_key_map.get(url, ""), "body": md_content}
        for url, md_content in batch_results.items()
    ]

    final_json = {
        "headers": {
            "request_type": "question_extraction_update_from_gpu_server",
            "request_id": request_id,
        },
        "body": {
            "input_files": input_files,
            "topics": topics,
            "md_context": md_context,
        },
    }

    return json.dumps(final_json, ensure_ascii=False), final_json
|
| 75 |
+
|
| 76 |
+
def callback(ch, method, properties, body):
    """
    Handle one incoming RabbitMQ message.

    Messages whose "process" header equals "topic_extraction" are run
    through the extraction pipeline; anything else just idles for 10s.
    """
    worker_name = threading.current_thread().name
    msg_headers = properties.headers or {}

    print(f"[Worker {worker_name}] Received message: {body}, headers: {msg_headers}")

    if msg_headers.get("process") != "topic_extraction":
        # Fallback if "process" is something else
        print(f"[Worker {worker_name}] Unknown process, sleeping 10s.")
        time.sleep(10)
    else:
        raw_text_outputs, _parsed = run_pipeline(body)
        # Do something with the result, e.g. print or store in DB
        print(f"[Worker {worker_name}] Pipeline result:\n{raw_text_outputs}")
    print("[Worker] Done")
|
| 95 |
+
|
| 96 |
+
def worker(channel):
    """Run a blocking consume loop on *channel*; log (not raise) any fatal error."""
    try:
        channel.start_consuming()
    except Exception as err:
        print(f"[Worker] Error: {err}")
|
| 101 |
+
|
| 102 |
+
def connect_to_rabbitmq():
    """
    Open a blocking RabbitMQ connection and bind the consumer callback to
    the durable "ml_server" queue.

    Returns ``(connection, channel)``.

    NOTE(review): auto_ack=True means a message is lost if the worker dies
    mid-processing, and it renders the basic_qos prefetch limit largely
    ineffective; consider manual acks if at-least-once delivery matters.
    """
    amqp_url = os.getenv("RABBITMQ_URL", "amqp://guest:guest@localhost:5672/")
    conn = pika.BlockingConnection(pika.URLParameters(amqp_url))
    ch = conn.channel()

    # Declare the queue
    ch.queue_declare(queue="ml_server", durable=True)

    # Limit messages per worker
    ch.basic_qos(prefetch_count=1)

    # auto_ack=True for simplicity, else you must ack manually
    ch.basic_consume(
        queue="ml_server",
        on_message_callback=callback,
        auto_ack=True,
    )
    return conn, ch
|
| 120 |
+
|
| 121 |
+
def main():
    """
    Main entry: start one consumer thread per CPU core, each with its own
    RabbitMQ connection and channel.
    """
    num_workers = multiprocessing.cpu_count()
    print(f"Starting {num_workers} workers")

    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        for _ in range(num_workers):
            _conn, chan = connect_to_rabbitmq()
            pool.submit(worker, chan)
|
| 132 |
+
|
| 133 |
+
if __name__ == "__main__":
    # When run directly: publish one sample "process_files" message for
    # smoke-testing, then start the worker pool.
    rabbit_url = os.getenv("RABBITMQ_URL", "amqp://guest:guest@localhost:5672/")
    connection = pika.BlockingConnection(pika.URLParameters(rabbit_url))
    channel = connection.channel()
    channel.queue_declare(queue="ml_server", durable=True)

    sample_message = {
        "headers": {
            "request_type": "process_files",
            "request_id": "abc123"
        },
        "body": {
            "input_files": [
                {
                    "key": "file1",
                    "url": "https://example.com/file1.pdf",
                    "type": "mark_scheme"
                },
                {
                    "key": "file2",
                    "url": "https://example.com/file2.pdf",
                    "type": "question"
                }
            ],
            "topics": [
                {
                    "title": "Algebra",
                    "id": 123
                }
            ]
        }
    }

    channel.basic_publish(
        exchange="",
        routing_key="ml_server",
        body=json.dumps(sample_message),
        properties=pika.BasicProperties(
            headers={"process": "topic_extraction"}
        ),
    )
    connection.close()

    main()
|