Spaces:
Sleeping
Sleeping
BlueSkyXN committed on
Commit ·
91b5bcf
1
Parent(s): 486a4a6
0.2.0
Browse files
- Dockerfile +11 -31
- README.md +90 -33
- entrypoint.sh +8 -2
- main.py +134 -67
- requirements.txt +1 -3
- test/test.py +43 -0
Dockerfile
CHANGED
|
@@ -1,46 +1,26 @@
|
|
| 1 |
-
# 使用官方
|
| 2 |
-
FROM
|
| 3 |
|
| 4 |
-
# 设置环境变量
|
| 5 |
-
ENV DEBIAN_FRONTEND=noninteractive
|
| 6 |
-
# 设置默认端口
|
| 7 |
ENV PORT=8000
|
|
|
|
| 8 |
|
| 9 |
-
#
|
| 10 |
-
# OCRmyPDF 需要这些系统依赖,即使我们通过 pip 安装 OCRmyPDF 本身
|
| 11 |
-
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 12 |
-
ghostscript \
|
| 13 |
-
tesseract-ocr \
|
| 14 |
-
tesseract-ocr-eng \
|
| 15 |
-
tesseract-ocr-chi-sim \
|
| 16 |
-
unpaper \
|
| 17 |
-
pngquant \
|
| 18 |
-
qpdf \
|
| 19 |
-
liblept5 \
|
| 20 |
-
libffi-dev \
|
| 21 |
-
# 编译依赖
|
| 22 |
-
build-essential \
|
| 23 |
-
python3-dev \
|
| 24 |
-
# 清理 apt 缓存以减小镜像大小
|
| 25 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 26 |
-
|
| 27 |
-
# 设置工作目录
|
| 28 |
WORKDIR /app
|
| 29 |
-
|
| 30 |
-
# 复制 Python 依赖文件
|
| 31 |
COPY requirements.txt .
|
| 32 |
-
|
| 33 |
-
# 安装 Python 依赖
|
| 34 |
-
# --no-cache-dir 减小镜像大小
|
| 35 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 36 |
|
| 37 |
-
# 复制
|
| 38 |
COPY main.py .
|
| 39 |
COPY entrypoint.sh .
|
| 40 |
|
| 41 |
-
# 设置
|
| 42 |
RUN chmod +x /app/entrypoint.sh
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
# 暴露端口
|
| 45 |
EXPOSE 8000
|
| 46 |
|
|
|
|
| 1 |
+
# 使用官方OCRmyPDF Alpine镜像作为基础
|
| 2 |
+
FROM jbarlow83/ocrmypdf-alpine:latest
|
| 3 |
|
| 4 |
+
# 设置环境变量
|
|
|
|
|
|
|
| 5 |
ENV PORT=8000
|
| 6 |
+
ENV PYTHONUNBUFFERED=1
|
| 7 |
|
| 8 |
+
# 安装Python依赖
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
WORKDIR /app
|
|
|
|
|
|
|
| 10 |
COPY requirements.txt .
|
|
|
|
|
|
|
|
|
|
| 11 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 12 |
|
| 13 |
+
# 复制应用代码和启动脚本
|
| 14 |
COPY main.py .
|
| 15 |
COPY entrypoint.sh .
|
| 16 |
|
| 17 |
+
# 设置启动脚本权限
|
| 18 |
RUN chmod +x /app/entrypoint.sh
|
| 19 |
|
| 20 |
+
# 创建临时工作目录
|
| 21 |
+
RUN mkdir -p /app/temp
|
| 22 |
+
RUN chmod 777 /app/temp
|
| 23 |
+
|
| 24 |
# 暴露端口
|
| 25 |
EXPOSE 8000
|
| 26 |
|
README.md
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
---
|
| 2 |
-
# Hugging Face Spaces 所需的配置信息
|
| 3 |
title: OCRmyPDF API 接口 # 显示在 Space 页面的标题 (可自定义)
|
| 4 |
emoji: 📄 # Space 图标的 Emoji (可选)
|
| 5 |
colorFrom: blue # 主题颜色起始 (可选)
|
|
@@ -9,50 +8,108 @@ app_port: 8000 # 你的 FastAPI 应用在容器内部监听的端口 (必须与
|
|
| 9 |
pinned: false # 是否在你的个人资料页置顶这个 Space (可选)
|
| 10 |
---
|
| 11 |
|
| 12 |
-
# OCRmyPDF API
|
| 13 |
|
| 14 |
-
|
| 15 |
|
| 16 |
-
##
|
| 17 |
|
| 18 |
-
|
| 19 |
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
-
|
| 23 |
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
-
|
| 27 |
-
* `language`: (必需) 用于 OCR 的语言。可选值:
|
| 28 |
-
* `eng` (仅英文)
|
| 29 |
-
* `chi_sim` (仅简体中文)
|
| 30 |
-
* `eng+chi_sim` (英文和简体中文 - **默认值**)
|
| 31 |
-
* `force_ocr`: (可选) `true` 或 `false`。即使文件看起来已有文本,是否强制进行 OCR? (默认: `false`)
|
| 32 |
-
* `deskew`: (可选) `true` 或 `false`。在 OCR 前是否进行图像歪斜校正? (默认: `false`)
|
| 33 |
-
* `optimize`: (可选) `0`, `1`, `2`, 或 `3`。PDF 优化级别 (0=无, 1=安全, 2=较强, 3=最强)。 (默认: `0` 以保证稳定性)。
|
| 34 |
|
| 35 |
-
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
| 40 |
|
| 41 |
-
|
| 42 |
|
| 43 |
-
|
| 44 |
-
* Content-Type: `application/json`
|
| 45 |
-
* 响应体: 包含错误详情的 JSON 对象。
|
| 46 |
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
|
| 50 |
-
* `/supported-languages/`: GET - 返回支持的语言参数列表。
|
| 51 |
|
| 52 |
-
|
|
|
|
| 53 |
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
|
|
|
| 2 |
title: OCRmyPDF API 接口 # 显示在 Space 页面的标题 (可自定义)
|
| 3 |
emoji: 📄 # Space 图标的 Emoji (可选)
|
| 4 |
colorFrom: blue # 主题颜色起始 (可选)
|
|
|
|
| 8 |
pinned: false # 是否在你的个人资料页置顶这个 Space (可选)
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# OCRmyPDF API 服务
|
| 12 |
|
| 13 |
+
本项目提供一个基于FastAPI的REST API,用于通过OCRmyPDF对PDF文件进行OCR处理,添加可搜索的文本层。API支持中文和英文OCR识别。
|
| 14 |
|
| 15 |
+
## 部署到Hugging Face Spaces
|
| 16 |
|
| 17 |
+
### 方法1:直接从GitHub仓库部署
|
| 18 |
|
| 19 |
+
1. 登录Hugging Face账户
|
| 20 |
+
2. 创建新的Space:
|
| 21 |
+
- 点击"Create New Space"
|
| 22 |
+
- 输入名称,例如"ocrmypdf-api"
|
| 23 |
+
- 选择"Docker"作为Space SDK
|
| 24 |
+
- 选择适当的硬件规格(推荐:CPU-M或更高配置,以处理大型PDF)
|
| 25 |
+
- 输入GitHub仓库URL
|
| 26 |
+
- 点击"Create Space"
|
| 27 |
|
| 28 |
+
### 方法2:手动上传文件
|
| 29 |
|
| 30 |
+
1. 创建新的Space,选择"Docker"作为Space SDK
|
| 31 |
+
2. 上传以下文件到Space:
|
| 32 |
+
- `Dockerfile`
|
| 33 |
+
- `requirements.txt`
|
| 34 |
+
- `main.py`
|
| 35 |
+
- `entrypoint.sh`
|
| 36 |
+
- `README.md`(可选)
|
| 37 |
+
3. Space会自动构建Docker镜像并启动服务
|
| 38 |
|
| 39 |
+
## API使用说明
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
+
### 端点
|
| 42 |
|
| 43 |
+
- `GET /` - API根检查
|
| 44 |
+
- `GET /health` - 健康检查,返回OCRmyPDF和Tesseract版本信息
|
| 45 |
+
- `GET /supported-languages/` - 查询支持的语言
|
| 46 |
+
- `POST /ocr/` - 处理PDF文件
|
| 47 |
|
| 48 |
+
### 示例请求
|
| 49 |
|
| 50 |
+
使用cURL:
|
|
|
|
|
|
|
| 51 |
|
| 52 |
+
```bash
|
| 53 |
+
curl -X POST "https://your-space-name.hf.space/ocr/" \
|
| 54 |
+
-H "accept: application/json" \
|
| 55 |
+
-H "Content-Type: multipart/form-data" \
|
| 56 |
+
-F "pdf_file=@your_file.pdf" \
|
| 57 |
+
-F "language=eng+chi_sim" \
|
| 58 |
+
-F "force_ocr=false" \
|
| 59 |
+
-F "deskew=true" \
|
| 60 |
+
-F "optimize=1" \
|
| 61 |
+
--output processed.pdf
|
| 62 |
+
```
|
| 63 |
|
| 64 |
+
使用Python:
|
|
|
|
| 65 |
|
| 66 |
+
```python
|
| 67 |
+
import requests

url = "https://your-space-name.hf.space/ocr/"

# Form fields accepted by POST /ocr/ (sent as strings in the multipart body)
payload = {
    'language': 'eng+chi_sim',
    'force_ocr': 'false',
    'deskew': 'true',
    'optimize': '1'
}

# Open the upload inside a context manager so the handle is always closed,
# even if the request raises.
with open('your_file.pdf', 'rb') as pdf_handle:
    files = {'pdf_file': pdf_handle}
    response = requests.post(url, data=payload, files=files)

# Fail loudly on an error status instead of saving a JSON error body as a PDF.
response.raise_for_status()

# Save the processed PDF
with open('processed.pdf', 'wb') as f:
    f.write(response.content)
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
## 参数说明
|
| 90 |
+
|
| 91 |
+
| 参数 | 类型 | 默认值 | 描述 |
|
| 92 |
+
|------|------|--------|------|
|
| 93 |
+
| language | string | "eng+chi_sim" | OCR语言,可选: "eng"(英文), "chi_sim"(简体中文), "eng+chi_sim"(中英文) |
|
| 94 |
+
| force_ocr | boolean | false | 是否强制对所有页面进行OCR处理,即使已包含文本 |
|
| 95 |
+
| deskew | boolean | false | 是否在OCR前自动校正倾斜的页面 |
|
| 96 |
+
| optimize | integer | 0 | PDF优化级别: 0=不优化, 1=安全优化, 2=强力优化, 3=最大优化 |
|
| 97 |
+
|
| 98 |
+
## 资源限制
|
| 99 |
+
|
| 100 |
+
- 最大文件大小: 200MB
|
| 101 |
+
- 最大页数: 1000页
|
| 102 |
+
- 处理超时: 1800秒(30分钟)
|
| 103 |
+
|
| 104 |
+
## 性能注意事项
|
| 105 |
+
|
| 106 |
+
- 大型PDF文件处理可能需要较长时间
|
| 107 |
+
- 高优化级别(2-3)会显著增加处理时间和资源消耗
|
| 108 |
+
- 如遇到超时问题,请尝试减小文件大小或降低优化级别
|
| 109 |
+
|
| 110 |
+
## 技术实现
|
| 111 |
+
|
| 112 |
+
本服务基于:
|
| 113 |
+
- OCRmyPDF官方Docker镜像
|
| 114 |
+
- FastAPI框架
|
| 115 |
+
- Tesseract OCR引擎(支持英文和简体中文)
|
entrypoint.sh
CHANGED
|
@@ -1,9 +1,15 @@
|
|
| 1 |
-
#!/bin/
|
| 2 |
|
| 3 |
# 打印环境信息用于调试
|
| 4 |
-
echo "Starting OCRmyPDF API"
|
| 5 |
echo "Environment: PORT=$PORT"
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
# 确保使用正确的端口变量
|
| 8 |
PORT="${PORT:-8000}"
|
| 9 |
echo "Using port: $PORT"
|
|
|
|
| 1 |
+
#!/bin/sh
|
| 2 |
|
| 3 |
# 打印环境信息用于调试
|
| 4 |
+
echo "Starting OCRmyPDF API Service"
|
| 5 |
echo "Environment: PORT=$PORT"
|
| 6 |
|
| 7 |
+
# 验证OCRmyPDF是否可用
|
| 8 |
+
echo "Checking OCRmyPDF installation..."
|
| 9 |
+
ocrmypdf --version
|
| 10 |
+
echo "Checking Tesseract installation..."
|
| 11 |
+
tesseract --version | head -n 1
|
| 12 |
+
|
| 13 |
# 确保使用正确的端口变量
|
| 14 |
PORT="${PORT:-8000}"
|
| 15 |
echo "Using port: $PORT"
|
main.py
CHANGED
|
@@ -1,14 +1,14 @@
|
|
| 1 |
import fastapi
|
| 2 |
-
from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Response
|
| 3 |
-
from fastapi.responses import FileResponse
|
|
|
|
| 4 |
import tempfile
|
| 5 |
import os
|
| 6 |
import shutil
|
| 7 |
import logging
|
| 8 |
import uuid
|
| 9 |
import PyPDF2
|
| 10 |
-
from typing import Literal
|
| 11 |
-
import ocrmypdf # 直接导入OCRmyPDF的Python API
|
| 12 |
|
| 13 |
# 配置日志记录
|
| 14 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
@@ -21,13 +21,14 @@ SUPPORTED_LANG_ARGS = {
|
|
| 21 |
"chi_sim": "Simplified Chinese only",
|
| 22 |
"eng+chi_sim": "English and Simplified Chinese"
|
| 23 |
}
|
| 24 |
-
DEFAULT_LANGUAGE_ARG = "eng+chi_sim"
|
| 25 |
-
ALLOWED_LANGUAGES = Literal["eng", "chi_sim", "eng+chi_sim"]
|
| 26 |
|
| 27 |
# 资源限制配置
|
| 28 |
MAX_FILE_SIZE_MB = 200 # 最大文件大小,MB
|
| 29 |
MAX_PAGES = 1000 # 最大页数
|
| 30 |
TIMEOUT_SECONDS = 1800 # OCR 处理超时时间,秒
|
|
|
|
| 31 |
# ----------------
|
| 32 |
|
| 33 |
# 初始化 FastAPI 应用
|
|
@@ -37,29 +38,58 @@ app = FastAPI(
|
|
| 37 |
version="1.0.0"
|
| 38 |
)
|
| 39 |
|
|
|
|
|
|
|
|
|
|
| 40 |
@app.get("/", summary="API Root Check")
|
| 41 |
async def read_root():
|
| 42 |
-
"""
|
| 43 |
-
return {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
@app.get("/health", summary="Health Check")
|
| 46 |
async def health_check():
|
| 47 |
-
"""
|
| 48 |
try:
|
| 49 |
-
# 检查OCRmyPDF
|
| 50 |
-
|
|
|
|
| 51 |
|
| 52 |
-
# 检查Tesseract
|
| 53 |
-
import subprocess
|
| 54 |
tesseract_result = subprocess.run(['tesseract', '--version'], capture_output=True, text=True, timeout=5)
|
| 55 |
tesseract_version = tesseract_result.stdout.split('\n')[0] if tesseract_result.returncode == 0 else "Not available"
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
# 返回健康状态
|
| 58 |
return {
|
| 59 |
"status": "healthy",
|
| 60 |
"ocrmypdf": ocrmypdf_version,
|
| 61 |
"tesseract": tesseract_version,
|
| 62 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
}
|
| 64 |
except Exception as e:
|
| 65 |
logger.error(f"Health check failed: {str(e)}")
|
|
@@ -70,12 +100,12 @@ async def health_check():
|
|
| 70 |
|
| 71 |
@app.get("/supported-languages/", summary="List Supported Languages")
|
| 72 |
async def get_supported_languages():
|
| 73 |
-
"""
|
| 74 |
return SUPPORTED_LANG_ARGS
|
| 75 |
|
| 76 |
@app.post("/ocr/",
|
| 77 |
summary="Perform OCR on PDF",
|
| 78 |
-
response_class=FileResponse,
|
| 79 |
responses={
|
| 80 |
200: {
|
| 81 |
"content": {"application/pdf": {}},
|
|
@@ -91,13 +121,13 @@ async def run_ocr_on_pdf(
|
|
| 91 |
pdf_file: UploadFile = File(..., description="The PDF file to be processed."),
|
| 92 |
force_ocr: bool = Form(False, description="Force OCR even if text seems present?"),
|
| 93 |
deskew: bool = Form(False, description="Deskew the image before OCR?"),
|
| 94 |
-
optimize: int = Form(0, description="PDF optimization level (0=None, 1=Safe, 2=Strong, 3=Max)
|
|
|
|
| 95 |
):
|
| 96 |
"""
|
| 97 |
-
|
| 98 |
-
and returns the processed PDF file.
|
| 99 |
"""
|
| 100 |
-
logger.info(f"Received request
|
| 101 |
|
| 102 |
# 基本文件验证
|
| 103 |
if not pdf_file.filename.lower().endswith(".pdf"):
|
|
@@ -114,10 +144,13 @@ async def run_ocr_on_pdf(
|
|
| 114 |
)
|
| 115 |
|
| 116 |
# 创建唯一的临时工作目录
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
| 118 |
# 在临时目录中定义输入和输出文件的路径
|
| 119 |
-
input_filename = f"input_{
|
| 120 |
-
output_filename = f"output_{
|
| 121 |
input_path = os.path.join(temp_dir, input_filename)
|
| 122 |
output_path = os.path.join(temp_dir, output_filename)
|
| 123 |
|
|
@@ -150,57 +183,84 @@ async def run_ocr_on_pdf(
|
|
| 150 |
logger.error(f"Error checking PDF pages: {str(e)}")
|
| 151 |
# 继续处理,不中断流程
|
| 152 |
|
| 153 |
-
#
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
|
|
|
|
|
|
| 185 |
|
| 186 |
-
# 检查
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
if not os.path.exists(output_path):
|
| 188 |
-
error_message = "OCR
|
| 189 |
logger.error(error_message)
|
| 190 |
raise HTTPException(status_code=500, detail=error_message)
|
| 191 |
|
| 192 |
-
# OCR 成功
|
| 193 |
logger.info(f"OCR successful. Output file generated at '{output_path}'")
|
|
|
|
| 194 |
# 生成友好的下载文件名
|
| 195 |
download_filename = f"ocr_{pdf_file.filename}" if pdf_file.filename else "processed_document.pdf"
|
| 196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
# 返回处理后的文件
|
| 198 |
return FileResponse(
|
| 199 |
path=output_path,
|
| 200 |
media_type='application/pdf',
|
| 201 |
-
filename=download_filename
|
|
|
|
| 202 |
)
|
| 203 |
|
|
|
|
|
|
|
|
|
|
| 204 |
except HTTPException as http_exc:
|
| 205 |
# 重新抛出已知的 HTTP 异常
|
| 206 |
raise http_exc
|
|
@@ -209,16 +269,23 @@ async def run_ocr_on_pdf(
|
|
| 209 |
logger.error(f"An unexpected error occurred during OCR processing for file '{pdf_file.filename}': {e}", exc_info=True)
|
| 210 |
raise HTTPException(status_code=500, detail=f"An unexpected server error occurred. Please try again later.")
|
| 211 |
finally:
|
| 212 |
-
# 无论成功与否,都清理临时目录及其内容
|
| 213 |
-
if os.path.exists(temp_dir):
|
| 214 |
-
logger.info(f"Cleaning up temporary directory: {temp_dir}")
|
| 215 |
-
try:
|
| 216 |
-
shutil.rmtree(temp_dir)
|
| 217 |
-
logger.info("Temporary directory cleaned up successfully.")
|
| 218 |
-
except Exception as cleanup_error:
|
| 219 |
-
logger.error(f"Error cleaning up temporary directory {temp_dir}: {cleanup_error}", exc_info=True)
|
| 220 |
# 确保关闭上传的文件句柄
|
| 221 |
await pdf_file.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
async def get_upload_file_size(upload_file: UploadFile) -> int:
|
| 224 |
"""获取上传文件的大小(以字节为单位)"""
|
|
|
|
| 1 |
import fastapi
|
| 2 |
+
from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Response, BackgroundTasks
|
| 3 |
+
from fastapi.responses import FileResponse, JSONResponse
|
| 4 |
+
import subprocess
|
| 5 |
import tempfile
|
| 6 |
import os
|
| 7 |
import shutil
|
| 8 |
import logging
|
| 9 |
import uuid
|
| 10 |
import PyPDF2
|
| 11 |
+
from typing import Literal, Optional, List
|
|
|
|
| 12 |
|
| 13 |
# 配置日志记录
|
| 14 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
|
| 21 |
"chi_sim": "Simplified Chinese only",
|
| 22 |
"eng+chi_sim": "English and Simplified Chinese"
|
| 23 |
}
|
| 24 |
+
DEFAULT_LANGUAGE_ARG = "eng+chi_sim" # 默认处理中英文混合
|
| 25 |
+
ALLOWED_LANGUAGES = Literal["eng", "chi_sim", "eng+chi_sim"] # FastAPI 类型提示
|
| 26 |
|
| 27 |
# 资源限制配置
|
| 28 |
MAX_FILE_SIZE_MB = 200 # 最大文件大小,MB
|
| 29 |
MAX_PAGES = 1000 # 最大页数
|
| 30 |
TIMEOUT_SECONDS = 1800 # OCR 处理超时时间,秒
|
| 31 |
+
TEMP_DIR = "/app/temp" # 临时文件目录
|
| 32 |
# ----------------
|
| 33 |
|
| 34 |
# 初始化 FastAPI 应用
|
|
|
|
| 38 |
version="1.0.0"
|
| 39 |
)
|
| 40 |
|
| 41 |
+
# 确保临时目录存在
|
| 42 |
+
os.makedirs(TEMP_DIR, exist_ok=True)
|
| 43 |
+
|
| 44 |
@app.get("/", summary="API Root Check")
async def read_root():
    """Report that the service is running and list its endpoints and languages."""
    # Human-readable description of each route exposed by this service.
    endpoint_summary = {
        "POST /ocr/": "OCR处理PDF文件",
        "GET /health": "健康检查",
        "GET /supported-languages/": "查询支持的语言",
    }
    # Iterating the mapping yields its keys, i.e. the accepted language arguments.
    language_codes = list(SUPPORTED_LANG_ARGS)
    return {
        "status": "running",
        "service": "OCRmyPDF API",
        "endpoints": endpoint_summary,
        "supported_languages": language_codes,
    }
|
| 57 |
|
| 58 |
@app.get("/health", summary="Health Check")
|
| 59 |
async def health_check():
|
| 60 |
+
"""提供详细的API和依赖健康状态检查"""
|
| 61 |
try:
|
| 62 |
+
# 检查 OCRmyPDF 是否可用
|
| 63 |
+
result = subprocess.run(['ocrmypdf', '--version'], capture_output=True, text=True, timeout=5)
|
| 64 |
+
ocrmypdf_version = result.stdout.strip() if result.returncode == 0 else "Not available"
|
| 65 |
|
| 66 |
+
# 检查 Tesseract 是否可用
|
|
|
|
| 67 |
tesseract_result = subprocess.run(['tesseract', '--version'], capture_output=True, text=True, timeout=5)
|
| 68 |
tesseract_version = tesseract_result.stdout.split('\n')[0] if tesseract_result.returncode == 0 else "Not available"
|
| 69 |
|
| 70 |
+
# 检查支持的语言
|
| 71 |
+
langs_result = subprocess.run(['tesseract', '--list-langs'], capture_output=True, text=True, timeout=5)
|
| 72 |
+
available_langs = langs_result.stdout.strip().split('\n')[1:] if langs_result.returncode == 0 else []
|
| 73 |
+
|
| 74 |
+
# 检查磁盘空间
|
| 75 |
+
disk_info = os.statvfs(TEMP_DIR)
|
| 76 |
+
free_space_mb = (disk_info.f_bavail * disk_info.f_frsize) / (1024 * 1024)
|
| 77 |
+
|
| 78 |
# 返回健康状态
|
| 79 |
return {
|
| 80 |
"status": "healthy",
|
| 81 |
"ocrmypdf": ocrmypdf_version,
|
| 82 |
"tesseract": tesseract_version,
|
| 83 |
+
"available_languages": available_langs,
|
| 84 |
+
"disk_space": {
|
| 85 |
+
"free_mb": round(free_space_mb, 2),
|
| 86 |
+
"temp_dir": TEMP_DIR
|
| 87 |
+
},
|
| 88 |
+
"resource_limits": {
|
| 89 |
+
"max_file_size_mb": MAX_FILE_SIZE_MB,
|
| 90 |
+
"max_pages": MAX_PAGES,
|
| 91 |
+
"timeout_seconds": TIMEOUT_SECONDS
|
| 92 |
+
}
|
| 93 |
}
|
| 94 |
except Exception as e:
|
| 95 |
logger.error(f"Health check failed: {str(e)}")
|
|
|
|
| 100 |
|
| 101 |
@app.get("/supported-languages/", summary="List Supported Languages")
async def get_supported_languages():
    """Return the mapping of supported language arguments to their descriptions."""
    supported = SUPPORTED_LANG_ARGS
    return supported
|
| 105 |
|
| 106 |
@app.post("/ocr/",
|
| 107 |
summary="Perform OCR on PDF",
|
| 108 |
+
response_class=FileResponse,
|
| 109 |
responses={
|
| 110 |
200: {
|
| 111 |
"content": {"application/pdf": {}},
|
|
|
|
| 121 |
pdf_file: UploadFile = File(..., description="The PDF file to be processed."),
|
| 122 |
force_ocr: bool = Form(False, description="Force OCR even if text seems present?"),
|
| 123 |
deskew: bool = Form(False, description="Deskew the image before OCR?"),
|
| 124 |
+
optimize: int = Form(0, description="PDF optimization level (0=None, 1=Safe, 2=Strong, 3=Max)"),
|
| 125 |
+
background_tasks: BackgroundTasks = None
|
| 126 |
):
|
| 127 |
"""
|
| 128 |
+
接收PDF文件,使用指定的语言进行OCR处理,并返回处理后的PDF文件。
|
|
|
|
| 129 |
"""
|
| 130 |
+
logger.info(f"Received request: filename={pdf_file.filename}, language={language}, force_ocr={force_ocr}, deskew={deskew}, optimize={optimize}")
|
| 131 |
|
| 132 |
# 基本文件验证
|
| 133 |
if not pdf_file.filename.lower().endswith(".pdf"):
|
|
|
|
| 144 |
)
|
| 145 |
|
| 146 |
# 创建唯一的临时工作目录
|
| 147 |
+
session_id = str(uuid.uuid4())
|
| 148 |
+
temp_dir = os.path.join(TEMP_DIR, session_id)
|
| 149 |
+
os.makedirs(temp_dir, exist_ok=True)
|
| 150 |
+
|
| 151 |
# 在临时目录中定义输入和输出文件的路径
|
| 152 |
+
input_filename = f"input_{session_id}.pdf"
|
| 153 |
+
output_filename = f"output_{session_id}.pdf"
|
| 154 |
input_path = os.path.join(temp_dir, input_filename)
|
| 155 |
output_path = os.path.join(temp_dir, output_filename)
|
| 156 |
|
|
|
|
| 183 |
logger.error(f"Error checking PDF pages: {str(e)}")
|
| 184 |
# 继续处理,不中断流程
|
| 185 |
|
| 186 |
+
# 构建 ocrmypdf 命令列表 - 利用镜像中预装的ocrmypdf
|
| 187 |
+
cmd = [
|
| 188 |
+
'ocrmypdf',
|
| 189 |
+
'-l', language, # 语言参数
|
| 190 |
+
'--jobs', '2', # 并行处理线程,根据资源调整
|
| 191 |
+
]
|
| 192 |
+
|
| 193 |
+
# 根据用户选项添加参数
|
| 194 |
+
if force_ocr:
|
| 195 |
+
cmd.append('--force-ocr')
|
| 196 |
+
else:
|
| 197 |
+
# 默认跳过已有文本的页面
|
| 198 |
+
cmd.append('--skip-text')
|
| 199 |
+
|
| 200 |
+
if deskew:
|
| 201 |
+
cmd.append('--deskew')
|
| 202 |
+
|
| 203 |
+
if optimize >= 0 and optimize <= 3:
|
| 204 |
+
cmd.extend(['--optimize', str(optimize)])
|
| 205 |
+
|
| 206 |
+
# 添加输入和输出文件路径
|
| 207 |
+
cmd.extend([input_path, output_path])
|
| 208 |
+
|
| 209 |
+
command_str = ' '.join(cmd)
|
| 210 |
+
logger.info(f"Executing command: {command_str}")
|
| 211 |
+
|
| 212 |
+
# 执行命令,设置超时
|
| 213 |
+
result = subprocess.run(
|
| 214 |
+
cmd,
|
| 215 |
+
capture_output=True,
|
| 216 |
+
text=True,
|
| 217 |
+
check=False,
|
| 218 |
+
timeout=TIMEOUT_SECONDS
|
| 219 |
+
)
|
| 220 |
|
| 221 |
+
# 检查命令执行结果
|
| 222 |
+
if result.returncode != 0:
|
| 223 |
+
# 处理已有OCR文本的情况
|
| 224 |
+
if "PriorOcrFoundError" in result.stderr:
|
| 225 |
+
logger.info("Document already contains OCR text. Returning original document.")
|
| 226 |
+
shutil.copy(input_path, output_path)
|
| 227 |
+
# 处理加密PDF的情况
|
| 228 |
+
elif "EncryptedPdfError" in result.stderr:
|
| 229 |
+
logger.error("PDF is encrypted and cannot be processed")
|
| 230 |
+
raise HTTPException(status_code=400, detail="Cannot process encrypted PDF. Please remove password protection first.")
|
| 231 |
+
# 其他错误
|
| 232 |
+
else:
|
| 233 |
+
error_message = f"OCRmyPDF failed with exit code {result.returncode}."
|
| 234 |
+
logger.error(f"{error_message}\nStderr: {result.stderr[:1000]}\nStdout: {result.stdout[:1000]}")
|
| 235 |
+
raise HTTPException(status_code=500, detail="OCR processing failed. Please check your PDF file or try different parameters.")
|
| 236 |
+
|
| 237 |
+
# 验证输出文件存在
|
| 238 |
if not os.path.exists(output_path):
|
| 239 |
+
error_message = "OCR command seemed successful but output file was not found."
|
| 240 |
logger.error(error_message)
|
| 241 |
raise HTTPException(status_code=500, detail=error_message)
|
| 242 |
|
| 243 |
+
# OCR 成功
|
| 244 |
logger.info(f"OCR successful. Output file generated at '{output_path}'")
|
| 245 |
+
|
| 246 |
# 生成友好的下载文件名
|
| 247 |
download_filename = f"ocr_{pdf_file.filename}" if pdf_file.filename else "processed_document.pdf"
|
| 248 |
|
| 249 |
+
# 注册清理临时目录的后台任务
|
| 250 |
+
if background_tasks:
|
| 251 |
+
background_tasks.add_task(cleanup_temp_dir, temp_dir)
|
| 252 |
+
|
| 253 |
# 返回处理后的文件
|
| 254 |
return FileResponse(
|
| 255 |
path=output_path,
|
| 256 |
media_type='application/pdf',
|
| 257 |
+
filename=download_filename,
|
| 258 |
+
background=background_tasks
|
| 259 |
)
|
| 260 |
|
| 261 |
+
except subprocess.TimeoutExpired:
|
| 262 |
+
logger.error(f"OCR processing timed out after {TIMEOUT_SECONDS} seconds for file '{pdf_file.filename}'.")
|
| 263 |
+
raise HTTPException(status_code=504, detail=f"OCR processing took too long and timed out after {TIMEOUT_SECONDS} seconds. Try with a smaller file or disable heavy options.")
|
| 264 |
except HTTPException as http_exc:
|
| 265 |
# 重新抛出已知的 HTTP 异常
|
| 266 |
raise http_exc
|
|
|
|
| 269 |
logger.error(f"An unexpected error occurred during OCR processing for file '{pdf_file.filename}': {e}", exc_info=True)
|
| 270 |
raise HTTPException(status_code=500, detail=f"An unexpected server error occurred. Please try again later.")
|
| 271 |
finally:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
# 确保关闭上传的文件句柄
|
| 273 |
await pdf_file.close()
|
| 274 |
+
|
| 275 |
+
# 清理临时目录会由后台任务处理,这里不需要额外操作
|
| 276 |
+
# 如果没有注册后台任务,则在这里清理
|
| 277 |
+
if not background_tasks and os.path.exists(temp_dir):
|
| 278 |
+
cleanup_temp_dir(temp_dir)
|
| 279 |
+
|
| 280 |
+
def cleanup_temp_dir(temp_dir: str):
    """Delete a temporary working directory and everything inside it.

    Best-effort helper: any failure is logged and swallowed so that cleanup
    never raises into the caller. A path that does not exist is a no-op.

    Args:
        temp_dir: Path of the directory to remove.
    """
    try:
        # Nothing to do when the directory is already gone.
        if not os.path.exists(temp_dir):
            return
        logger.info(f"Cleaning up temporary directory: {temp_dir}")
        shutil.rmtree(temp_dir)
        logger.info("Temporary directory cleaned up successfully.")
    except Exception as cleanup_error:
        logger.error(f"Error cleaning up temporary directory {temp_dir}: {cleanup_error}", exc_info=True)
|
| 289 |
|
| 290 |
async def get_upload_file_size(upload_file: UploadFile) -> int:
|
| 291 |
"""获取上传文件的大小(以字节为单位)"""
|
requirements.txt
CHANGED
|
@@ -1,6 +1,4 @@
|
|
| 1 |
fastapi==0.95.0
|
| 2 |
uvicorn[standard]==0.22.0
|
| 3 |
python-multipart==0.0.6
|
| 4 |
-
PyPDF2==3.0.1
|
| 5 |
-
# 直接通过pip安装OCRmyPDF
|
| 6 |
-
ocrmypdf==15.4.3
|
|
|
|
| 1 |
fastapi==0.95.0
|
| 2 |
uvicorn[standard]==0.22.0
|
| 3 |
python-multipart==0.0.6
|
| 4 |
+
PyPDF2==3.0.1
|
|
|
|
|
|
test/test.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Manual smoke test: upload a local PDF to the deployed OCR API and save the result."""
import os
import time

import requests

# API endpoint and local input/output paths
api_url = "https://blueskyxn-ocrmypdf-hfs.hf.space/ocr/"
pdf_path = r"F:\Download\20250401-113339.pdf"
output_path = r"F:\Download\ocr_result_python.pdf"

# Form fields sent alongside the uploaded file
data = {
    "language": "eng+chi_sim",
    "deskew": "true",
    "optimize": "1"
}

# (connect, read) timeouts in seconds. The read timeout sits just above the
# service's documented 1800-second processing limit, so the client gives up
# only after the server would have, instead of hanging forever.
REQUEST_TIMEOUT = (10, 1900)

# The context manager guarantees the upload handle is closed on every path.
with open(pdf_path, "rb") as pdf_handle:
    files = {"pdf_file": pdf_handle}

    print(f"开始处理文件: {pdf_path}")
    print(f"文件大小: {os.path.getsize(pdf_path)/1024/1024:.2f} MB")
    start_time = time.time()

    # Send the request
    print("正在发送请求到OCR API...")
    response = requests.post(api_url, files=files, data=data, timeout=REQUEST_TIMEOUT)

# Handle the response
if response.status_code == 200:
    # Save the processed PDF
    with open(output_path, "wb") as f:
        f.write(response.content)
    print(f"PDF处理成功!耗时: {time.time() - start_time:.2f}秒")
    print(f"结果已保存到: {output_path}")
else:
    print(f"处理失败! 状态码: {response.status_code}")
    try:
        error_details = response.json()
    except ValueError:
        # The body was not valid JSON; show the start of the raw text instead.
        print(f"响应内容: {response.text[:500]}...")
    else:
        print(f"错误详情: {error_details}")
|