Spaces:
Running
Running
Commit ·
e2dd6bd
0
Parent(s):
clean: fresh lightweight repo
Browse files- .gitignore +3 -0
- Dockerfile +25 -0
- app.py +1150 -0
- requirements.txt +18 -0
.gitignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
models/
|
| 2 |
+
inputs/
|
| 3 |
+
__pycache__/
|
Dockerfile
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12
|
| 2 |
+
WORKDIR /app
|
| 3 |
+
|
| 4 |
+
RUN pip install --no-cache-dir --timeout 120 fastapi uvicorn requests pyyaml numpy httpx python-docx openpyxl python-pptx pdfplumber "xlrd==1.2.0" tavily-python ebooklib beautifulsoup4 mobi
|
| 5 |
+
RUN pip install --no-cache-dir --timeout 120 torch --index-url https://download.pytorch.org/whl/cpu
|
| 6 |
+
RUN pip install --no-cache-dir --timeout 120 sentence-transformers faiss-cpu
|
| 7 |
+
# Prebuilt llama-cpp-python CPU wheel (installs in about 30 s, no compilation needed)
|
| 8 |
+
RUN pip install --no-cache-dir --timeout 300 "llama-cpp-python>=0.3.4" \
|
| 9 |
+
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
|
| 10 |
+
|
| 11 |
+
# Models are downloaded on demand by app.py at startup (avoids re-downloading ~4 GB on every code change)
|
| 12 |
+
# All source code, tools and configuration are baked into the image
|
| 13 |
+
COPY app.py /app/
|
| 14 |
+
COPY scripts /app/scripts/
|
| 15 |
+
COPY services /app/services/
|
| 16 |
+
COPY skills /app/skills/
|
| 17 |
+
|
| 18 |
+
# Automatically pull third-party skills (skills installed via "@ai install-skill" are picked up by the next build)
|
| 19 |
+
RUN git clone --depth 1 --single-branch https://github.com/hughyonng/OpenWolf.git /tmp/update \
|
| 20 |
+
&& cp -r /tmp/update/skills/third-party /app/skills/ 2>/dev/null || true \
|
| 21 |
+
&& cp -r /tmp/update/skills/library /app/skills/ 2>/dev/null || true \
|
| 22 |
+
&& rm -rf /tmp/update
|
| 23 |
+
|
| 24 |
+
EXPOSE 7860
|
| 25 |
+
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
app.py
ADDED
|
@@ -0,0 +1,1150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
OpenWolf HF Spaces — FastAPI entry point
|
| 3 |
+
Model weights are downloaded on demand at startup, keeping the Docker image lightweight
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
import json
|
| 8 |
+
import asyncio
|
| 9 |
+
import time
|
| 10 |
+
import threading
|
| 11 |
+
import uuid
|
| 12 |
+
import requests
|
| 13 |
+
import re
|
| 14 |
+
import hashlib
|
| 15 |
+
import random
|
| 16 |
+
import base64
|
| 17 |
+
import shutil
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
from fastapi import FastAPI, Request, HTTPException, BackgroundTasks
|
| 20 |
+
import concurrent.futures
|
| 21 |
+
from fastapi.responses import JSONResponse
|
| 22 |
+
|
| 23 |
+
# Put the image's /app directory at the front of the import path.
sys.path.insert(0, "/app")

# ── Default environment variables ──
# setdefault() only fills in names that are missing; values already present
# in the environment are left untouched. The two GitHub token variables fall
# back to GITHUB_PAT (or an empty string when that is unset as well).
_github_pat = os.environ.get("GITHUB_PAT", "")
for _env_name, _env_default in (
    ("ISSUE_NUMBER", "0"),
    ("COMMENT_BODY", ""),
    ("COMMENT_USER", "spaces"),
    ("GITHUB_REPO", "hughyonng/OpenWolf"),
    ("GITHUB_TOKEN", _github_pat),
    ("OPENWOLF_PAT", _github_pat),
    ("TELEGRAM_BOT_TOKEN", ""),
    ("TELEGRAM_CHAT_ID", ""),
):
    os.environ.setdefault(_env_name, _env_default)
del _github_pat, _env_name, _env_default
|
| 34 |
+
|
| 35 |
+
# ASGI application object; the Dockerfile CMD serves it as "app:app" via uvicorn.
app = FastAPI(title="OpenWolf Agent with Cloud Acceleration")
|
| 36 |
+
|
| 37 |
+
@app.exception_handler(Exception)
async def _catch_all(request: Request, exc: Exception):
    """Last-resort handler: log the unhandled exception and reply with JSON.

    The request method and path are printed together with the exception, and
    the client receives ``{"ok": False, "error": str(exc)}`` with HTTP 500.
    """
    print(f"[FATAL] {request.method} {request.url.path}: {exc}")
    error_payload = {"ok": False, "error": str(exc)}
    return JSONResponse(content=error_payload, status_code=500)
|
| 41 |
+
|
| 42 |
+
# Service lifecycle flags.
_ready = False            # flipped to True by the startup hook
_model_loading = False
_model_loaded = False     # set once the bge-m3 warm-up load succeeds
_background_tasks = set()

# Worker pools, one per kind of background job.
_extract_executor = concurrent.futures.ThreadPoolExecutor(
    max_workers=1, thread_name_prefix="extract"
)
_translate_pool = concurrent.futures.ThreadPoolExecutor(
    max_workers=2, thread_name_prefix="translate"
)
_analyze_pool = concurrent.futures.ThreadPoolExecutor(
    max_workers=2, thread_name_prefix="analyze"
)
_task_pool = concurrent.futures.ThreadPoolExecutor(
    max_workers=2, thread_name_prefix="text_task"
)

# GGUF model and related global state.
_llama_model = None                # populated by _get_llama()
_llama_lock = threading.Lock()     # held while the GGUF model is being loaded
_infer_lock = threading.Lock()     # presumably serialises local inference — confirm against callers

# Task registries (presumably keyed by task id — verify against the endpoints that use them).
_translate_tasks = {}
_analyze_tasks = {}
_task_tasks = {}

PAGES_PER_CHUNK = 6  # pages handled per chunk during lazy OCR extraction of scanned PDFs
|
| 60 |
+
|
| 61 |
+
# ══════════════════════════════════════════════════════════════════
|
| 62 |
+
# ModelScope 每日额度持久化管理器
|
| 63 |
+
# ══════════════════════════════════════════════════════════════════
|
| 64 |
+
|
| 65 |
+
class ModelScopeQuotaManager:
    """Persistent per-day request counter for the ModelScope API tier.

    State is kept in a small JSON file shaped like
    ``{"date": "YYYY-MM-DD", "total": int, "usage": {model_name: int}}`` and is
    re-read from disk on every ``increment`` call. Counters reset when the
    local date changes.

    Args:
        file_path: Where to persist the counters. Defaults to
            ``DEFAULT_QUOTA_FILE``.
        daily_total_limit: Maximum number of granted requests per day across
            all models. Defaults to ``DAILY_TOTAL_LIMIT``.
        per_model_limit: Maximum number of granted requests per day for a
            single model. Defaults to ``PER_MODEL_DAILY_LIMIT``.
    """

    DEFAULT_QUOTA_FILE = Path("/app/.translate_cache/modelscope_quota.json")
    DAILY_TOTAL_LIMIT = 1000
    PER_MODEL_DAILY_LIMIT = 200

    def __init__(self, file_path=None, daily_total_limit=None, per_model_limit=None):
        self.lock = threading.Lock()
        self.file_path = Path(file_path) if file_path is not None else self.DEFAULT_QUOTA_FILE
        self.daily_total_limit = (
            self.DAILY_TOTAL_LIMIT if daily_total_limit is None else daily_total_limit
        )
        self.per_model_limit = (
            self.PER_MODEL_DAILY_LIMIT if per_model_limit is None else per_model_limit
        )
        self.file_path.parent.mkdir(parents=True, exist_ok=True)
        self._load()

    def _load(self):
        """Read the quota file into ``self.data``; fall back to ``{}`` when unusable."""
        try:
            loaded = json.loads(self.file_path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Missing file, unreadable file, bad encoding or invalid JSON.
            loaded = {}
        # Valid JSON that is not an object (e.g. a list) is treated as empty too.
        self.data = loaded if isinstance(loaded, dict) else {}

    def _save(self):
        """Persist ``self.data``; failures are logged and otherwise ignored."""
        try:
            # Write to a sibling temp file and rename it into place so that a
            # crash mid-write cannot leave a truncated quota file behind.
            tmp_path = self.file_path.with_name(self.file_path.name + ".tmp")
            tmp_path.write_text(json.dumps(self.data, ensure_ascii=False), encoding="utf-8")
            tmp_path.replace(self.file_path)
        except (OSError, TypeError, ValueError) as e:
            print(f"[quota] 保存配额记录失败: {e}")

    def increment(self, model_name: str) -> bool:
        """Reserve one request for ``model_name`` if today's quota allows it.

        Returns:
            True when the request was counted and may proceed; False when
            either the daily total or the per-model limit is already reached
            (nothing is written in that case).
        """
        with self.lock:
            self._load()
            today = time.strftime("%Y-%m-%d", time.localtime())

            if self.data.get("date") != today:
                self.data = {"date": today, "total": 0, "usage": {}}

            # Repair partially written / hand-edited files instead of raising
            # KeyError on every call for the rest of the day.
            usage = self.data.get("usage")
            if not isinstance(usage, dict):
                usage = {}
                self.data["usage"] = usage
            total = self.data.get("total")
            if not isinstance(total, int):
                total = sum(v for v in usage.values() if isinstance(v, int))
            current_usage = usage.get(model_name, 0)
            if not isinstance(current_usage, int):
                current_usage = 0

            if total >= self.daily_total_limit:
                print(f"[quota] ModelScope 达到单日总上限 {self.daily_total_limit} 次")
                return False
            if current_usage >= self.per_model_limit:
                print(f"[quota] ModelScope 模型 {model_name} 达到单日限制 {self.per_model_limit} 次")
                return False

            usage[model_name] = current_usage + 1
            self.data["total"] = total + 1
            self._save()
            return True
|
| 107 |
+
|
| 108 |
+
# Module-level quota tracker; the cloud translation router consults it before each ModelScope request.
_quota_manager = ModelScopeQuotaManager()
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
@app.on_event("startup")
async def startup():
    """Mark the service ready, then fetch and warm up models on a daemon thread.

    The readiness flag is set immediately; model download and warm-up run in
    the background so the server can start answering requests right away.
    """
    global _ready
    _ready = True
    print("[startup] OpenWolf Spaces ready")

    def _ensure_models():
        """Download missing model files, then load bge-m3 once as a warm-up."""
        global _model_loaded, _model_loading

        models_root = Path("/app/models")
        translate_dir = models_root / "translate"

        # Step 1: embedding model.
        # NOTE(review): the marker file is looked up under /app/models/bge-m3,
        # but no download location is passed to SentenceTransformer here —
        # confirm that this check can ever be satisfied.
        try:
            if not (models_root / "bge-m3" / "config.json").exists():
                print("[models] Downloading bge-m3 (2.2GB)...")
                t0 = time.time()
                from sentence_transformers import SentenceTransformer
                _ = SentenceTransformer("BAAI/bge-m3", device="cpu")
                print(f"[models] bge-m3 done in {time.time()-t0:.1f}s")
        except Exception as e:
            print(f"[models] bge-m3 download failed: {e}")

        # Step 2: local translation model. The first candidate is fetched
        # only when none of the candidate files is present yet.
        try:
            gguf_candidates = (
                ("HY-MT1.5-1.8B-Q4_K_M.gguf", 1.13),
                ("HY-MT1.5-1.8B-Q8_0.gguf", 1.91),
            )
            already_present = any(
                (translate_dir / fname).exists() for fname, _size in gguf_candidates
            )
            if not already_present:
                _name, _gb = gguf_candidates[0]
                print(f"[models] Downloading {_name} ({_gb}GB)...")
                t0 = time.time()
                from huggingface_hub import hf_hub_download
                hf_hub_download(
                    repo_id="tencent/HY-MT1.5-1.8B-GGUF",
                    filename=_name,
                    local_dir=str(translate_dir),
                )
                print(f"[models] GGUF done in {time.time()-t0:.1f}s")
        except Exception as e:
            print(f"[models] GGUF download failed: {e}")

        # Printed regardless of whether the downloads above succeeded.
        print("[models] All models ready")

        # Step 3: warm-up load of the embedding model.
        try:
            print("[warmup] Loading bge-m3...")
            t0 = time.time()
            from sentence_transformers import SentenceTransformer
            _ = SentenceTransformer("BAAI/bge-m3", device="cpu")
            _model_loaded = True
            _model_loading = False
            print(f"[warmup] bge-m3 loaded in {time.time()-t0:.1f}s")
        except Exception as e:
            print(f"[warmup] bge-m3 FAILED: {e}")
            _model_loading = False

    threading.Thread(target=_ensure_models, daemon=True).start()
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
# ══════════════════════════════════════════════════════════════════
|
| 176 |
+
# 工具函数群:动态密钥加载与语系检测
|
| 177 |
+
# ══════════════════════════════════════════════════════════════════
|
| 178 |
+
|
| 179 |
+
def get_multi_api_keys(prefix: str, max_index: int = 9) -> list:
    """Collect API keys for a provider from numbered environment variables.

    For each index ``i`` in ``1..max_index`` the variables ``{prefix}_{i}`` and
    ``{prefix}{i}`` are checked in that order and the first non-blank value is
    used. The bare ``{prefix}`` variable is considered last.

    Args:
        prefix: Environment variable name stem, e.g. ``"GROQ_API_KEY"``.
        max_index: Highest numeric suffix to probe (inclusive).

    Returns:
        The keys with surrounding whitespace stripped, in discovery order.
        Blank values are skipped and duplicates are reported only once, so
        callers never retry the same key or send an empty bearer token.
    """
    candidates = []
    for i in range(1, max_index + 1):
        for var_name in (f"{prefix}_{i}", f"{prefix}{i}"):
            value = (os.environ.get(var_name) or "").strip()
            if value:
                candidates.append(value)
                break
    candidates.append((os.environ.get(prefix) or "").strip())

    keys = []
    for key in candidates:
        if key and key not in keys:
            keys.append(key)
    return keys
|
| 190 |
+
|
| 191 |
+
def detect_japanese_korean(text: str) -> str:
    """Classify text as Japanese, Korean or other by script-specific characters.

    Returns:
        ``"ja"`` if the text contains any hiragana or katakana character,
        ``"ko"`` if it contains any Hangul syllable, otherwise ``"en"``.
        Kana is checked first, so mixed text is reported as ``"ja"``.
    """
    # The ranges are spelled with \u escapes so that the range endpoints
    # (some of which are unassigned code points) cannot be lost in transit,
    # which would leave a dangling literal "-" inside the character class and
    # make every hyphenated English text match.
    if re.search(r"[\u3040-\u309f\u30a0-\u30ff]", text):  # hiragana + katakana
        return "ja"
    if re.search(r"[\uac00-\ud7af]", text):  # Hangul syllables
        return "ko"
    return "en"
|
| 198 |
+
|
| 199 |
+
def semantic_split(text: str, target_chars: int = 16000) -> list:
    """Split text into paragraph-aligned chunks of roughly ``target_chars``.

    Lines are stripped and blank lines dropped. Processing stops at the first
    paragraph that starts with a bibliography heading, and short paragraphs
    (under 80 characters) containing page/volume/DOI/URL markers are skipped.
    Paragraphs inside a chunk are joined with a blank line. A single paragraph
    is never split, so a chunk may exceed ``target_chars``.

    Returns:
        The list of chunks, or ``[text]`` when nothing survived filtering.
    """
    reference_heads = ("references", "bibliography", "literature cited")
    noise_markers = ("page", "vol.", "no.", "issn", "doi:", "http://", "https://")

    chunks = []
    pending = []
    pending_size = 0

    for raw_line in text.split("\n"):
        para = raw_line.strip()
        if not para:
            continue
        lowered = para.lower()

        # Everything from the bibliography heading onwards is discarded.
        if lowered.startswith(reference_heads):
            print("[split] 识别到参考文献标记,已略过后续内容")
            break

        # Short header/footer-like lines are dropped.
        if len(para) < 80 and any(marker in lowered for marker in noise_markers):
            continue

        if pending and pending_size + len(para) > target_chars:
            chunks.append("\n\n".join(pending))
            pending = [para]
            pending_size = len(para)
        else:
            pending.append(para)
            pending_size += len(para) + 2  # +2 accounts for the "\n\n" joiner

    if pending:
        chunks.append("\n\n".join(pending))

    return chunks or [text]
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
# ══════════════════════════════════════════════════════════════════
|
| 234 |
+
# 在线 OCR 模块(硅基流动视觉大模型 Lazy 加载版)
|
| 235 |
+
# ══════════════════════════════════════════════════════════════════
|
| 236 |
+
|
| 237 |
+
def _ocr_page_via_siliconflow(pdf_path: Path, page_index: int) -> str:
    """OCR one PDF page through the SiliconFlow chat-completions API.

    The page is rendered to a JPEG at 150 dpi with PyMuPDF, base64-encoded and
    sent as an image message. The model name comes from ``OCRAI_OCR_MODEL``
    (default ``PaddlePaddle/PaddleOCR-VL-1.5``).

    Args:
        pdf_path: Path of the PDF file to read.
        page_index: Zero-based page number.

    Returns:
        The extracted text, or ``""`` when ``SILICONFLOW_API_KEY`` is not set,
        the page index is past the end of the document, the API answers with
        a non-200 status, or any exception occurs (failures are logged).
    """
    sf_key = os.environ.get("SILICONFLOW_API_KEY")
    if not sf_key:
        print("[ocr] 未配置 SILICONFLOW_API_KEY,无法使用在线 OCR")
        return ""

    try:
        import fitz

        doc = fitz.open(pdf_path)
        try:
            if page_index >= len(doc):
                return ""
            pix = doc[page_index].get_pixmap(dpi=150)
            img_bytes = pix.tobytes("jpg")
        finally:
            # Release the file handle as soon as the page has been rendered;
            # the document is not needed for the network call.
            doc.close()

        b64_img = base64.b64encode(img_bytes).decode("utf-8")

        ocr_model = os.environ.get("OCRAI_OCR_MODEL", "PaddlePaddle/PaddleOCR-VL-1.5")
        url = "https://api.siliconflow.cn/v1/chat/completions"
        headers = {"Authorization": f"Bearer {sf_key.strip()}", "Content-Type": "application/json"}
        body = {
            "model": ocr_model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Extract all academic text in this image precisely. Keep formatting and paragraphs. Do not summarize."},
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64_img}"}},
                    ],
                }
            ],
            "temperature": 0.1,
        }
        r = requests.post(url, headers=headers, json=body, timeout=40)
        if r.status_code == 200:
            extracted_text = r.json()["choices"][0]["message"]["content"].strip()
            print(f"[ocr] 页面 {page_index+1} OCR 成功 ({len(extracted_text)} 字符)")
            return extracted_text
        else:
            print(f"[ocr] OCR HTTP {r.status_code}: {r.text[:150]}")
    except Exception as e:
        # Best effort: a failed page yields an empty string rather than an error.
        print(f"[ocr] 页面 {page_index+1} OCR 失败: {e}")
    return ""
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
# ══════════════════════════════════════════════════════════════════
|
| 284 |
+
# 极速引擎:云端 API 5层混合翻译链
|
| 285 |
+
# ══════════════════════════════════════════════════════════════════
|
| 286 |
+
|
| 287 |
+
def _strip_think(text: str) -> str:
    """Remove ``<think>…</think>`` blocks and any trailing source-language text.

    After the think blocks are removed, the text is scanned line by line and
    cut at the first line that is longer than 60 characters and consists of
    more than 80% ASCII letters; that line and everything after it are
    dropped. The result is stripped of surrounding whitespace.
    """
    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL).strip()

    kept = []
    for line in text.split('\n'):
        ascii_letters = sum(1 for ch in line if ch.isascii() and ch.isalpha())
        looks_like_source = len(line) > 60 and ascii_letters / max(len(line), 1) > 0.8
        if looks_like_source:
            break
        kept.append(line)
    return '\n'.join(kept).strip()
|
| 300 |
+
|
| 301 |
+
def _call_api(url: str, headers: dict, body: dict, timeout: int = 12) -> str:
    """POST a chat-completions request and return the cleaned reply text.

    Args:
        url: Chat-completions endpoint.
        headers: HTTP headers, including the Authorization header.
        body: JSON payload (``model``, ``messages``, ...).
        timeout: Request timeout in seconds.

    Returns:
        The first choice's message content passed through ``_strip_think``,
        or ``""`` on a non-200 status or any exception. Failures never raise;
        they are logged so that a failing provider can be diagnosed.
    """
    try:
        r = requests.post(url, headers=headers, json=body, timeout=timeout)
        if r.status_code != 200:
            # Log the status and a body snippet, as the OCR helper does,
            # instead of failing silently.
            print(f"[translate] API HTTP {r.status_code} ({body.get('model')}): {r.text[:150]}")
            return ""
        result = r.json()["choices"][0]["message"]["content"].strip()
        return _strip_think(result)
    except Exception as e:
        print(f"[translate] API 异常: {e}")
    return ""
|
| 311 |
+
|
| 312 |
+
def _reorder_qwen_first(models: list, is_jk: bool) -> list:
    """Move Qwen models to the front of the list for Japanese/Korean input.

    When ``is_jk`` is false the input list is returned unchanged (same
    object). Otherwise a new list is returned in which every model whose name
    contains "qwen" (case-insensitive) comes first; relative order within
    each group is kept.
    """
    if is_jk:
        # sorted() is stable, so a boolean key partitions without reshuffling.
        return sorted(models, key=lambda name: "qwen" not in name.lower())
    return models
|
| 317 |
+
|
| 318 |
+
def _cloud_try_models(url, api_key, models, messages, label, timeout, *,
                      extra_body=None, allow=None, log_model=True):
    """Try each model in order against one endpoint; return the first non-empty reply.

    Args:
        url: Chat-completions endpoint of the provider.
        api_key: Bearer token (surrounding whitespace is stripped).
        models: Model names to try, in priority order.
        messages: Chat messages shared by every request.
        label: Tier/provider tag used in the success log line.
        timeout: Per-request timeout in seconds.
        extra_body: Optional extra payload fields (e.g. ``max_tokens``).
        allow: Optional predicate called with the model name; the model is
            skipped when it returns false.
        log_model: Whether the success log line includes the model name.

    Returns:
        The translated text, or ``""`` when every model failed or was skipped.
    """
    headers = {"Authorization": f"Bearer {api_key.strip()}", "Content-Type": "application/json"}
    for model in models:
        if allow is not None and not allow(model):
            continue
        body = {"model": model, "messages": messages, "temperature": 0.2}
        if extra_body:
            body.update(extra_body)
        result = _call_api(url, headers, body, timeout=timeout)
        if result:
            suffix = f" {model}" if log_model else ""
            print(f"[translate] {label}{suffix} 成功")
            return result
    return ""


def _translate_via_cloud_router(text: str, prev_source: str = "", prev_trans: str = "") -> str:
    """Translate ``text`` to Chinese through a five-tier chain of cloud APIs.

    Providers are tried in a fixed order and the first non-empty answer wins:

    1. ModelScope (each attempt is gated by the daily quota manager)
    2. Cerebras, then Groq (all configured keys, in shuffled order)
    3. OpenRouter free models, SiliconFlow Hunyuan-MT-7B, NVIDIA
    4. Mistral, then opencode (all configured keys, in shuffled order)
    5. Zhipu flash models, then SiliconFlow small models

    A provider is skipped when its API key variable is unset or empty. For
    text detected as Japanese or Korean, Qwen models are moved to the front
    of the ModelScope, OpenRouter, NVIDIA and SiliconFlow model lists.

    Args:
        text: Source text to translate.
        prev_source: Previous source chunk; its last 200 characters are
            included as context when ``prev_trans`` is also given.
        prev_trans: Translation of the previous chunk (last 200 characters
            are used).

    Returns:
        The translated text, or ``""`` when every provider failed.
    """
    is_jk = detect_japanese_korean(text) in ("ja", "ko")

    context_prompt = ""
    if prev_source and prev_trans:
        context_prompt = (
            f"### 上文翻译参考:\n"
            f"【原文】:{prev_source[-200:]}\n"
            f"【译文】:{prev_trans[-200:]}\n\n"
        )

    system_prompt = (
        "你是一位精通多国语言的资深学术翻译专家。请将下面的英文学术文本翻译成中文。\n"
        "## 翻译规则:\n"
        "1. 保持专业学术语言风格,用词准确,翻译自然流畅。\n"
        "2. 专有名词首次出现时请保留英文原文,格式如:“卷积神经网络 (Convolutional Neural Network, CNN)”。\n"
        "3. 人名、地名首次出现时使用中英对照。\n"
        "4. 严格保持原文段落和标点符号结构的完整,保留代码、公式、数字、年份。\n"
        "5. 只输出中文译文,绝对不要包含原文。译文结束后立即结束,不要追加任何原文。\n"
        "6. 不要在译文前加任何导语、标题、说明或介绍。直接从正文开始。\n"
        "7. 不要输出 Markdown 标记、代码块或多余的空行。\n"
        "8. 禁止输出思考过程。不要使用<think>标签或任何其他格式输出推理过程。"
    )
    user_content = f"{context_prompt}## 待翻译文本:\n{text}"
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]

    def attempt(url, api_key, models, label, timeout, **options):
        return _cloud_try_models(url, api_key, models, messages, label, timeout, **options)

    # ────── Tier 1: ModelScope (quota-limited rotation) ──────
    ms_key = os.environ.get("MODELSCOPE_API_KEY")
    if ms_key:
        ms_models = _reorder_qwen_first([
            "Qwen/Qwen3.5-397B-A17B",
            "Qwen/Qwen3-235B-A22B-Thinking-2507",
            "deepseek-ai/DeepSeek-V4-Pro",
            "ZhipuAI/GLM-5.1",
            "deepseek-ai/DeepSeek-V3.2",
            "Qwen/Qwen3.5-122B-A10B",
            "MiniMax/MiniMax-M1-80k",
            "deepseek-ai/DeepSeek-R1-0528",
            "ZhipuAI/GLM-5",
        ], is_jk)
        # The quota is consumed before the request is sent, whether or not
        # the request then succeeds.
        result = attempt("https://api-inference.modelscope.cn/v1/chat/completions",
                         ms_key, ms_models, "第一层 ModelScope", 12,
                         allow=_quota_manager.increment)
        if result:
            return result

    # ────── Tier 2: main tier (Cerebras & Groq, multi-key rotation) ──────
    cerebras_keys = get_multi_api_keys("CEREBRAS_API_KEY")
    groq_keys = get_multi_api_keys("GROQ_API_KEY")
    random.shuffle(cerebras_keys)
    random.shuffle(groq_keys)

    for key in cerebras_keys:
        result = attempt("https://api.cerebras.ai/v1/chat/completions",
                         key, ["gpt-oss-120b", "zai-glm-4.7"], "第二层 Cerebras", 10)
        if result:
            return result

    for key in groq_keys:
        result = attempt("https://api.groq.com/openai/v1/chat/completions",
                         key, ["openai/gpt-oss-120b", "llama-3.3-70b-versatile"],
                         "第二层 Groq", 10)
        if result:
            return result

    # ────── Tier 3: free supplementary tier ──────
    or_key = os.environ.get("OPENROUTER_API_KEY")
    if or_key:
        or_models = _reorder_qwen_first([
            "qwen/qwen3-coder:free", "meta-llama/llama-3.3-70b-instruct:free",
            "z-ai/glm-4.5-air:free", "nvidia/nemotron-3-super-120b-a12b:free",
            "qwen/qwen3-next-80b-a3b-instruct:free",
        ], is_jk)
        result = attempt("https://openrouter.ai/api/v1/chat/completions",
                         or_key, or_models, "第三层 OpenRouter", 12)
        if result:
            return result

    sf_key = os.environ.get("SILICONFLOW_API_KEY")
    if sf_key:
        result = attempt("https://api.siliconflow.cn/v1/chat/completions",
                         sf_key, ["tencent/Hunyuan-MT-7B"], "第三层 Hunyuan-MT-7B", 12,
                         log_model=False)
        if result:
            return result

    nv_key = os.environ.get("NVIDIA_API_KEY")
    if nv_key:
        nv_models = _reorder_qwen_first([
            "qwen/qwen3.5-397b-a17b", "qwen/qwen3-coder-480b-a35b-instruct",
            "qwen/qwen3.5-122b-a10b", "z-ai/glm-5.1",
            "nvidia/nemotron-3-super-120b-a12b",
        ], is_jk)
        result = attempt("https://integrate.api.nvidia.com/v1/chat/completions",
                         nv_key, nv_models, "第三层 NVIDIA", 12,
                         extra_body={"max_tokens": 4096})
        if result:
            return result

    # ────── Tier 4: secondary main tier ──────
    mistral_keys = get_multi_api_keys("MISTRAL_API_KEY")
    opencode_keys = get_multi_api_keys("OPENCODE_API_KEY")
    random.shuffle(mistral_keys)
    random.shuffle(opencode_keys)

    for key in mistral_keys:
        result = attempt("https://api.mistral.ai/v1/chat/completions",
                         key, ["mistral-large-latest", "mistral-medium-latest"],
                         "第四层 Mistral", 12)
        if result:
            return result

    for key in opencode_keys:
        result = attempt("https://opencode.ai/zen/v1/chat/completions",
                         key, ["big-pickle", "nemotron-3-super-free", "deepseek-v4-flash-free"],
                         "第四层 opencode", 12)
        if result:
            return result

    # ────── Tier 5: lightweight last-resort tier ──────
    zp_key = os.environ.get("ZHIPUAI_API_KEY")
    if zp_key:
        result = attempt("https://open.bigmodel.cn/api/paas/v4/chat/completions",
                         zp_key, ["glm-4.7-flash", "glm-4.6-flash", "GLM-Z1-Flash", "GLM-4-Flash"],
                         "第五层 智谱", 10)
        if result:
            return result

    if sf_key:
        sf_free = _reorder_qwen_first([
            "deepseek-ai/DeepSeek-R1-0528-Qwen3-8B", "Qwen/Qwen3.5-4B",
            "Qwen/Qwen3-8B", "THUDM/GLM-Z1-9B-0414", "THUDM/GLM-4-9B-0414",
        ], is_jk)
        result = attempt("https://api.siliconflow.cn/v1/chat/completions",
                         sf_key, sf_free, "第五层 硅基", 10)
        if result:
            return result

    return ""
|
| 496 |
+
|
| 497 |
+
|
| 498 |
+
# ══════════════════════════════════════════════════════════════════
|
| 499 |
+
# 本地 GGUF 离线兜底引擎
|
| 500 |
+
# ══════════════════════════════════════════════════════════════════
|
| 501 |
+
|
| 502 |
+
def _get_llama():
    """Return the process-wide local GGUF translation model, loading it on first use.

    The cached instance is returned when present. Otherwise, under
    ``_llama_lock``, the function waits up to 60 s for the Q4_K_M file (then,
    if that file is absent, up to 10 s for the Q8_0 file) and loads the model
    on a helper thread, waiting at most 90 s for it to finish.

    Raises:
        FileNotFoundError: Neither model file exists after waiting.
        TimeoutError: Loading did not finish within 90 seconds.
        Exception: Whatever the ``Llama`` constructor raised.
    """
    global _llama_model
    if _llama_model is not None:
        return _llama_model

    def _looks_complete(path):
        # A file larger than 100 MiB is taken as "download finished".
        return os.path.isfile(path) and os.path.getsize(path) > 100 * 1024 * 1024

    with _llama_lock:
        # Checked again under the lock: another caller may have loaded it meanwhile.
        if _llama_model is not None:
            return _llama_model

        model_path = "/app/models/translate/HY-MT1.5-1.8B-Q4_K_M.gguf"
        for _i in range(60):
            if _looks_complete(model_path):
                break
            print(f"[llama] 等待本地兜底模型准备就绪... {_i}s")
            time.sleep(1)

        # NOTE(review): the fallback decisions below test only for existence,
        # not for the size threshold used while waiting — confirm intended.
        if not os.path.isfile(model_path):
            model_path = "/app/models/translate/HY-MT1.5-1.8B-Q8_0.gguf"
            for _ in range(10):
                if _looks_complete(model_path):
                    break
                time.sleep(1)

        if not os.path.isfile(model_path):
            raise FileNotFoundError("GGUF model file not found")

        from llama_cpp import Llama

        optimal_threads = max(1, min(4, os.cpu_count() or 4))
        outcome = {}
        finished = threading.Event()

        def _load():
            try:
                print(f"[llama] Loading HY-MT1.5 with {optimal_threads} threads...")
                t0 = time.time()
                outcome["model"] = Llama(
                    model_path=model_path,
                    n_ctx=2048,
                    n_threads=optimal_threads,
                    n_gpu_layers=0,
                    verbose=False,
                    use_mmap=True,
                    use_mlock=False,
                )
                print(f"[llama] Loaded in {time.time()-t0:.1f}s")
            except Exception as e:
                outcome["error"] = e
            finally:
                finished.set()

        # Load on a daemon thread so the wait below can be bounded.
        threading.Thread(target=_load, daemon=True).start()
        if not finished.wait(timeout=90):
            raise TimeoutError("GGUF model loading timed out (90s)")
        if outcome.get("error"):
            raise outcome["error"]

        _llama_model = outcome.get("model")
        return _llama_model
|
| 558 |
+
|
| 559 |
+
|
| 560 |
+
def _translate_chunk_local_with_context(text: str, prev_source: str = "", prev_trans: str = "") -> str:
    """Translate *text* to Chinese with the local GGUF model.

    Args:
        text: Source text to translate.
        prev_source: Source text of the previous chunk; only its last 200
            characters are put into the prompt.
        prev_trans: Translation of the previous chunk; only its last 200
            characters are put into the prompt.

    Returns:
        The stripped model output, or ``""`` when *text* is blank or the model
        returns no content.

    Raises:
        Whatever ``_get_llama()`` or ``create_chat_completion`` raises.
    """
    # Nothing to translate: skip loading / invoking the model altogether.
    if not text or not text.strip():
        return ""

    llm = _get_llama()
    # Output budget scales with input length, clamped to [128, 1024] tokens.
    _max = max(128, min(1024, int(len(text) * 1.5)))
    # NOTE(review): _get_llama() creates the model with n_ctx=2048 while
    # api_job_start splits documents with target_chars=16000 — a full chunk
    # presumably cannot fit the context window; confirm and sub-split if so.

    context_prompt = ""
    if prev_source and prev_trans:
        context_prompt = (
            f"### 上文翻译参考:\n"
            f"【上文原文】:{prev_source[-200:]}\n"
            f"【上文译文】:{prev_trans[-200:]}\n\n"
        )

    _prompt = (
        "你是一位精通多国语言的资深学术翻译专家。请将下面的英文学术文本翻译成中文。\n"
        "## 翻译规则:\n"
        "1. 保持专业学术语言风格,用词准确,翻译自然流畅。\n"
        "2. 专有名词首次出现时请保留英文原文。\n"
        "3. 人名、地名首次出现时使用中英对照。\n"
        "4. 严格保持原文段落结构,保留代码、公式、数字、年份。\n"
        "5. 只输出中文译文,绝对不要包含原文。\n"
        "6. 不要在译文前加任何导语、标题、说明。直接从正文开始。\n"
        "7. 不要输出 Markdown 标记、代码块或多余的空行。\n\n"
        f"{context_prompt}"
        "## 待翻译文本:\n"
        f"{text}"
    )
    # Inference calls on the shared model are serialized through _infer_lock.
    with _infer_lock:
        out = llm.create_chat_completion(
            messages=[{"role": "user", "content": _prompt}],
            max_tokens=_max, temperature=0.1,
        )
    # Guard against a missing/None content field instead of failing on .strip().
    content = out["choices"][0]["message"].get("content") or ""
    return content.strip()
|
| 592 |
+
|
| 593 |
+
|
| 594 |
+
# ══════════════════════════════════════════════════════════════════
|
| 595 |
+
# 防弹版异步翻译任务启动(含懒加载 OCR + 5层路由 + GGUF兜底)
|
| 596 |
+
# ══════════════════════════════════════════════════════════════════
|
| 597 |
+
|
| 598 |
+
def _tjob_safe_component(value, default: str) -> str:
    """Reduce *value* to a string that is safe as a single path component.

    Every character other than letters, digits, ``-`` and ``_`` becomes ``_``,
    so a request-supplied identifier cannot carry a path separator or a ``..``
    segment.  *default* is returned when nothing usable remains.
    """
    cleaned = "".join(ch if (ch.isalnum() or ch in "-_") else "_" for ch in str(value))
    return cleaned if cleaned.strip("_") else default


def _tjob_http_download(url: str, suffix: str, *, headers=None, timeout: int = 120,
                        require_200: bool = True):
    """Stream *url* into a new file under /app/inputs and return its path.

    Returns None when the server does not answer with a success status
    (exactly 200 when *require_200*, otherwise any ``Response.ok`` status).
    Network/IO errors propagate; a partially written file is removed first.
    """
    import requests as _rt

    target = Path("/app") / f"inputs/{uuid.uuid4().hex}{suffix}"
    with _rt.get(url, headers=headers or {}, timeout=timeout, stream=True) as resp:
        accepted = (resp.status_code == 200) if require_200 else resp.ok
        if not accepted:
            return None
        target.parent.mkdir(parents=True, exist_ok=True)
        try:
            with open(target, "wb") as fh:
                for piece in resp.iter_content(chunk_size=65536):
                    fh.write(piece)
        except Exception:
            target.unlink(missing_ok=True)
            raise
    return target


def _tjob_fetch_document(payload: dict, safe_name: str, ext: str):
    """Locate the source document by trying each source in order.

    Order: R2 URL, generic download URL, local path / GitHub contents API,
    Telegram ``getFile``.  Returns ``(path, is_temp)`` where *is_temp* tells
    the caller the file was downloaded into /app/inputs and may be deleted.

    Raises:
        ValueError: no source produced a file.
    """
    import requests as _rt

    file_id = payload.get("fileId") or payload.get("file_id")
    file_path = payload.get("file_path") or payload.get("filePath")
    dl_url = payload.get("download_url") or payload.get("downloadUrl")
    r2_url = payload.get("r2_download_url") or payload.get("r2DownloadUrl")
    gh_pat = (os.environ.get("OPENWOLF_PAT") or os.environ.get("GITHUB_PAT")
              or os.environ.get("GITHUB_TOKEN") or "")

    print(f"[download] 开始定位文件: file_id={file_id}")

    # 1) R2 object storage
    if r2_url:
        try:
            found = _tjob_http_download(r2_url, f"_{safe_name}")
            if found is not None:
                return found, True
        except Exception as exc:
            print(f"[download] R2 异常: {exc}")

    # 2) Generic download URL (GitHub API URLs get the token attached)
    if dl_url:
        try:
            headers = {}
            if "api.github.com" in dl_url and gh_pat:
                headers["Authorization"] = f"Bearer {gh_pat}"
                headers["Accept"] = "application/vnd.github.raw"
            found = _tjob_http_download(dl_url, f".{ext}", headers=headers)
            if found is not None:
                return found, True
        except Exception as exc:
            print(f"[download] download_url 异常: {exc}")

    # 3) Local file, else GitHub contents API
    if file_path:
        # NOTE(security): file_path comes from the request body and is probed
        # on the local filesystem as-is (absolute paths included) — confirm
        # that only trusted callers can reach this endpoint.
        local_check = Path("/app") / file_path
        if local_check.is_file():
            return local_check, False
        if Path(file_path).is_file():
            return Path(file_path), False
        gh_repo = os.environ.get("GITHUB_REPO", "hughyonng/OpenWolf")
        if gh_pat:
            try:
                found = _tjob_http_download(
                    f"https://api.github.com/repos/{gh_repo}/contents/{file_path}",
                    f".{ext}",
                    headers={"Authorization": f"Bearer {gh_pat}",
                             "Accept": "application/vnd.github.raw"},
                )
                if found is not None:
                    return found, True
            except Exception as exc:
                print(f"[download] GitHub API 异常: {exc}")

    # 4) Telegram as the last resort
    if file_id:
        token = os.environ.get("TELEGRAM_BOT_TOKEN", "")
        try:
            if token:
                lookup = _rt.get(f"https://api.telegram.org/bot{token}/getFile",
                                 params={"file_id": file_id}, timeout=30)
                info = lookup.json().get("result", {}) if lookup.ok else {}
                remote = info.get("file_path", "")
                if lookup.ok and remote:
                    found = _tjob_http_download(
                        f"https://api.telegram.org/file/bot{token}/{remote}",
                        f"_{remote.split('/')[-1]}",
                        timeout=300, require_200=False,
                    )
                    if found is not None:
                        return found, True
        except Exception as exc:
            # Request errors embed the URL, which contains the bot token.
            shown = str(exc).replace(token, "***") if token else str(exc)
            print(f"[download] Telegram 异常: {shown}")

    raise ValueError("无法在所有防护层中下载文档")


def _tjob_probe_pdf(pdf_path):
    """Return ``(is_scanned, total_pages)`` for a PDF using PyMuPDF (``fitz``).

    A PDF counts as scanned when its first three pages together yield fewer
    than 100 characters of text.  The document handle is always closed.
    """
    import fitz

    doc = fitz.open(pdf_path)
    try:
        total = len(doc)
        sample = "".join(doc[i].get_text() or "" for i in range(min(3, total)))
    finally:
        doc.close()
    return len(sample.strip()) < 100, total


def _tjob_extract_text(doc_path, ext: str) -> str:
    """Extract the full text of a pdf / txt / md / csv / json / docx file.

    Returns ``""`` for any other extension.
    """
    if ext == "pdf":
        import pdfplumber as _pp
        with _pp.open(doc_path) as pdf:
            return "\n".join(page.extract_text() or "" for page in pdf.pages)
    if ext in ("txt", "md", "csv", "json"):
        with open(doc_path, "r", encoding="utf-8", errors="ignore") as fh:
            return fh.read()
    if ext == "docx":
        import docx as _dx
        return "\n".join(p.text for p in _dx.Document(doc_path).paragraphs)
    return ""


def _tjob_read_chunks(cache_file) -> list:
    """Load the cached chunk list; return ``[]`` if unreadable or not a list."""
    try:
        data = json.loads(cache_file.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        print(f"[translate] chunk cache unreadable: {exc}")
        return []
    return data if isinstance(data, list) else []


def _tjob_translate(context_dir, ci: int, source_text: str, total_chunks: int) -> dict:
    """Translate one chunk (cloud router first, local GGUF as fallback).

    Also stores the chunk and its translation as context for the next chunk
    and prunes older context files.  Returns the result payload dict.
    """
    prev_source, prev_trans = _load_context(context_dir, ci)
    translated = _translate_via_cloud_router(source_text, prev_source, prev_trans)
    if not translated:
        print("[translate] 在线路由失败,降级 GGUF")
        translated = _translate_chunk_local_with_context(source_text, prev_source, prev_trans)
    _save_context(context_dir, ci, source_text, translated)
    _clean_old_context(context_dir, ci)
    return {"translated_text": translated, "has_more": (ci + 1) < total_chunks,
            "chunk_index": ci, "total_chunks": total_chunks}


def _tjob_run(t_id: str, payload) -> None:
    """Background worker: translate one chunk and record the outcome.

    The outcome is stored in ``_translate_tasks[t_id]`` with status ``done``
    (result = JSON payload string) or ``error`` (result = JSON error string).
    """
    try:
        raw_name = str(payload.get("fileName") or payload.get("filename") or "document.pdf")
        # Only the basename is ever used to build local paths.
        safe_name = Path(raw_name).name or "document.pdf"
        ext = safe_name.rsplit(".", 1)[-1].lower() if "." in safe_name else "pdf"

        # chunk_index may arrive as None, -1, a string, ...; normalize to >= 0.
        try:
            ci = int(payload.get("chunk_index", 0))
        except (TypeError, ValueError):
            ci = 0
        ci = max(ci, 0)

        chat_key = _tjob_safe_component(payload.get("chat_id") or "default", "default")
        context_dir = Path("/app/.context_cache") / chat_key
        context_dir.mkdir(parents=True, exist_ok=True)
        chunks_file = context_dir / "chunks_list.json"
        meta_file = context_dir / "pdf_metadata.json"
        fixed_path = context_dir / f"source_document.{ext}"

        # Metadata written when the first chunk was processed.
        is_scanned, total_pages = False, 0
        if meta_file.is_file():
            try:
                meta = json.loads(meta_file.read_text(encoding="utf-8"))
                is_scanned = meta.get("is_scanned", False)
                total_pages = meta.get("total_pages", 0)
            except (OSError, ValueError, AttributeError) as exc:
                print(f"[meta] cache unreadable: {exc}")

        # Follow-up chunk of a text document: reuse the cached split.
        chunks = []
        if ci > 0 and not is_scanned and chunks_file.is_file():
            chunks = _tjob_read_chunks(chunks_file)
            if chunks:
                print(f"[translate] 命中缓存: {ci + 1}/{len(chunks)}")

        # First chunk, or a text document whose split is no longer cached
        # (e.g. after a restart): fetch and analyse the document again rather
        # than reporting the book as finished.
        if ci == 0 or (not is_scanned and not chunks):
            fetched, is_temp = _tjob_fetch_document(payload, safe_name, ext)
            try:
                shutil.copy2(fetched, fixed_path)
            finally:
                if is_temp:
                    # The cache copy is what gets used from here on.
                    fetched.unlink(missing_ok=True)

            if ext == "pdf":
                try:
                    is_scanned, total_pages = _tjob_probe_pdf(fixed_path)
                    print(f"[translate] PDF 属性: {'扫描版' if is_scanned else '电子版'}")
                except Exception as exc:
                    is_scanned = False
                    print(f"[translate] PDF 属性检测异常: {exc}")
            else:
                is_scanned = False

            try:
                meta_file.write_text(json.dumps({
                    "is_scanned": is_scanned, "total_pages": total_pages,
                    "file_ext": ext, "file_name": raw_name,
                }, ensure_ascii=False), encoding="utf-8")
            except Exception as exc:
                print(f"[meta] 缓存异常: {exc}")

            # Text document: extract everything once and split semantically.
            if not is_scanned:
                full_text = _tjob_extract_text(fixed_path, ext)
                if not full_text.strip():
                    raise ValueError("文本提取为空")
                chunks = semantic_split(full_text, target_chars=16000)
                try:
                    chunks_file.write_text(json.dumps(chunks, ensure_ascii=False), encoding="utf-8")
                    print(f"[translate] 分段完成: {len(chunks)} 段")
                except Exception as exc:
                    print(f"[translate] 写入缓存失败: {exc}")

        if is_scanned:
            # Scanned PDF: each chunk is a window of PAGES_PER_CHUNK pages.
            total_chunks = (total_pages + PAGES_PER_CHUNK - 1) // PAGES_PER_CHUNK
            if ci >= total_chunks:
                result_payload = {"translated_text": "🎉 本书已全部翻译完毕!", "has_more": False,
                                  "chunk_index": ci, "total_chunks": total_chunks}
            else:
                start_page = ci * PAGES_PER_CHUNK
                end_page = min(start_page + PAGES_PER_CHUNK, total_pages)
                print(f"[ocr] 提取第 {start_page+1}-{end_page} 页...")
                page_texts = [_ocr_page_via_siliconflow(fixed_path, p_idx)
                              for p_idx in range(start_page, end_page)]
                raw_text = "".join(t + "\n\n" for t in page_texts if t)
                if not raw_text.strip():
                    raise ValueError(f"OCR 未在第 {start_page+1}-{end_page} 页识别到有效字符")
                result_payload = _tjob_translate(context_dir, ci, raw_text, total_chunks)
        else:
            if not chunks and chunks_file.is_file():
                chunks = _tjob_read_chunks(chunks_file)
            total_chunks = len(chunks) if chunks else 1
            if ci >= total_chunks or not chunks:
                result_payload = {"translated_text": "🎉 本书已翻译完毕!", "has_more": False,
                                  "chunk_index": ci, "total_chunks": total_chunks}
            else:
                result_payload = _tjob_translate(context_dir, ci, chunks[ci], total_chunks)

        _translate_tasks[t_id] = {"status": "done",
                                  "result": json.dumps(result_payload, ensure_ascii=False)}
    except Exception as e:
        import traceback
        print(f"[api_job_start] 异常: {e}")
        traceback.print_exc()
        _translate_tasks[t_id] = {"status": "error", "result": json.dumps({"error": str(e)})}


@app.post("/api/job/start")
async def api_job_start(request: Request):
    """Start an asynchronous translation job for one chunk of a document.

    JSON fields read from the body: ``fileId``/``file_id``,
    ``file_path``/``filePath``, ``download_url``/``downloadUrl``,
    ``r2_download_url``/``r2DownloadUrl``, ``fileName``/``filename``,
    ``chunk_index`` and ``chat_id``.

    Returns:
        ``{"ok": True, "task_id": ...}`` once the job is queued on
        ``_translate_pool``, or ``{"ok": False, "error": ...}`` if the body is
        not valid JSON or the job could not be queued.
    """
    try:
        body = await request.json()
    except Exception as e:
        return {"ok": False, "error": f"JSON 解析失败: {e}"}

    task_id = str(uuid.uuid4())
    try:
        _translate_tasks[task_id] = {"status": "processing", "result": None}
        _translate_pool.submit(_tjob_run, task_id, body)
        return {"ok": True, "task_id": task_id}
    except Exception as e:
        # Do not leave behind an entry that would stay "processing" forever.
        _translate_tasks.pop(task_id, None)
        return {"ok": False, "error": f"路由层报错: {e}"}
|
| 852 |
+
|
| 853 |
+
|
| 854 |
+
# ══════════════════════════════════════════════════════════════════
|
| 855 |
+
# 滑窗上下文管理
|
| 856 |
+
# ══════════════════════════════════════════════════════════════════
|
| 857 |
+
|
| 858 |
+
def _load_context(context_dir: Path, ci: int):
    """Return ``(source, translation)`` saved for chunk ``ci - 1``.

    Both values are empty strings for the first chunk or when either of the
    two context files is missing.
    """
    if not ci > 0:
        return "", ""
    earlier = ci - 1
    source_file = context_dir / f"src_{earlier}.txt"
    translation_file = context_dir / f"trans_{earlier}.txt"
    if not (source_file.is_file() and translation_file.is_file()):
        return "", ""
    return (
        source_file.read_text(encoding="utf-8", errors="ignore"),
        translation_file.read_text(encoding="utf-8", errors="ignore"),
    )
|
| 866 |
+
|
| 867 |
+
def _save_context(context_dir: Path, ci: int, src: str, trans: str):
    """Write chunk *ci*'s source and translation to ``src_{ci}.txt`` / ``trans_{ci}.txt``."""
    for prefix, content in (("src", src), ("trans", trans)):
        target = context_dir / f"{prefix}_{ci}.txt"
        target.write_text(content, encoding="utf-8")
|
| 870 |
+
|
| 871 |
+
def _clean_old_context(context_dir: Path, ci: int):
    """Delete context files older than the previous chunk.

    Removes every ``src_<n>.txt`` / ``trans_<n>.txt`` in *context_dir* whose
    index ``n`` is below ``ci - 1``.  Files with an unparsable index are left
    alone, and a failed deletion is logged instead of aborting the cleanup.
    """
    oldest_kept = ci - 1
    for entry in context_dir.glob("*.txt"):
        name = entry.name
        if not name.startswith(("src_", "trans_")):
            continue
        try:
            index = int(name.split("_")[1].split(".")[0])
        except ValueError:
            # Not one of our numbered context files.
            continue
        if index < oldest_kept:
            try:
                entry.unlink()
            except OSError as exc:
                print(f"[context] failed to remove {name}: {exc}")
|
| 881 |
+
|
| 882 |
+
|
| 883 |
+
@app.get("/api/job/check/{task_id}")
async def api_job_check(task_id: str):
    """Report the state of a translation task.

    A finished task (``done`` or ``error``) is removed from
    ``_translate_tasks`` as it is returned, so its result can be read once.
    """
    try:
        entry = _translate_tasks.get(task_id)
        if not entry:
            return {"ok": False, "status": "error", "result": "任务ID不存在"}
        if entry["status"] not in ("done", "error"):
            return {"ok": True, "status": "processing"}
        snapshot = entry.copy()
        del _translate_tasks[task_id]
        return {"ok": True, "status": snapshot["status"], "result": snapshot["result"]}
    except Exception as e:
        return {"ok": False, "status": "error", "result": f"检查报错: {e}"}
|
| 896 |
+
|
| 897 |
+
|
| 898 |
+
@app.get("/debug/model")
async def debug_model():
    """Report which GGUF model files exist, their sizes and the loader state."""
    q4_path = "/app/models/translate/HY-MT1.5-1.8B-Q4_K_M.gguf"
    q8_path = "/app/models/translate/HY-MT1.5-1.8B-Q8_0.gguf"
    has_q4 = os.path.isfile(q4_path)
    has_q8 = os.path.isfile(q8_path)

    def _size_gb(path, present):
        # Size in GiB rounded to two decimals; 0 when the file is absent.
        if not present:
            return 0
        return round(os.path.getsize(path) / 1024**3, 2)

    report = {
        "q4_exists": has_q4,
        "q8_exists": has_q8,
        "q4_size_gb": _size_gb(q4_path, has_q4),
        "q8_size_gb": _size_gb(q8_path, has_q8),
        "llama_loaded": _llama_model is not None,
    }
    try:
        import llama_cpp
        report["llama_cpp_version"] = llama_cpp.__version__
    except ImportError:
        report["llama_cpp_version"] = None
    return JSONResponse(report)
|
| 916 |
+
|
| 917 |
+
|
| 918 |
+
@app.post("/ping")
async def ping():
    """Liveness probe: always answers with a fixed pong payload."""
    reply = dict(ok=True, msg="pong")
    return reply
|
| 921 |
+
|
| 922 |
+
|
| 923 |
+
# ── 以下保留原有接口(analyze-doc, task, skill, health 等)──
|
| 924 |
+
|
| 925 |
+
@app.post("/analyze-doc")
async def analyze_doc(request: Request):
    """Download the PDF at ``url`` and check that text can be extracted from it.

    Returns:
        ``{"ok": True, "result": "分析完成"}`` when text was extracted, otherwise
        ``{"ok": False, "error": ...}``.

    Raises:
        HTTPException: 400 when the request body is not valid JSON.
    """
    # NOTE(review): the extracted text is not part of the response (only a
    # fixed status string is returned), so "question" and "max_chars" in the
    # body have no effect here — confirm whether that is intended.
    # NOTE(security): "url" comes straight from the request body and is fetched
    # server-side without any allow-list.
    try:
        body = await request.json()
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid JSON")
    url = body.get("url", "")
    if not url:
        return {"ok": False, "error": "url required"}

    import requests as _req
    import uuid as _uuid
    from pathlib import Path
    import pdfplumber

    local_path = Path("/app") / f"inputs/{_uuid.uuid4().hex}.pdf"
    try:
        # The context manager releases the streamed connection in every case.
        with _req.get(url, timeout=300, stream=True) as resp:
            if resp.status_code != 200:
                return {"ok": False, "error": f"下载失败 HTTP {resp.status_code}"}
            local_path.parent.mkdir(parents=True, exist_ok=True)
            with open(local_path, "wb") as f:
                for chunk in resp.iter_content(chunk_size=65536):
                    f.write(chunk)

        with pdfplumber.open(local_path) as p:
            page_texts = (page.extract_text() for page in p.pages)
            text = "".join(t + "\n" for t in page_texts if t)
    finally:
        # Remove the temporary download even when parsing fails.
        try:
            local_path.unlink(missing_ok=True)
        except OSError:
            pass

    if not text.strip():
        return {"ok": False, "error": "无法提取文本内容"}
    return {"ok": True, "result": "分析完成"}
|
| 962 |
+
|
| 963 |
+
@app.post("/analyze-doc/start")
async def analyze_doc_start(request: Request):
    """Queue an asynchronous document analysis and return its task id.

    Raises:
        HTTPException: 400 when the request body is not valid JSON.
    """
    try:
        payload = await request.json()
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid JSON")

    doc_url = payload.get("url", "")
    prompt = payload.get("question", "请分析这份文档的内容")
    char_limit = int(payload.get("max_chars", 50000))
    if not doc_url:
        return {"ok": False, "error": "url required"}

    new_id = str(uuid.uuid4())
    _analyze_tasks[new_id] = dict(status="processing", result=None)
    _analyze_pool.submit(_do_analyze_async, new_id, doc_url, prompt, char_limit)
    return {"ok": True, "task_id": new_id}
|
| 978 |
+
|
| 979 |
+
@app.post("/analyze-text/start")
async def analyze_text_start(request: Request):
    """Queue an asynchronous analysis of inline text and return its task id.

    Raises:
        HTTPException: 400 when the request body is not valid JSON.
    """
    try:
        payload = await request.json()
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid JSON")

    content = payload.get("text", "")
    prompt = payload.get("question", "")
    if not (content and prompt):
        return {"ok": False, "error": "text and question required"}

    new_id = str(uuid.uuid4())
    _analyze_tasks[new_id] = dict(status="processing", result=None)
    _analyze_pool.submit(_do_analyze_text_async, new_id, content, prompt)
    return {"ok": True, "task_id": new_id}
|
| 993 |
+
|
| 994 |
+
@app.get("/analyze-text/check/{task_id}")
async def analyze_text_check(task_id: str):
    """Alias of ``analyze_doc_check``: both endpoints read the same task store."""
    outcome = await analyze_doc_check(task_id)
    return outcome
|
| 997 |
+
|
| 998 |
+
@app.get("/analyze-doc/check/{task_id}")
async def analyze_doc_check(task_id: str):
    """Report the state of an analysis task.

    A finished task is removed from ``_analyze_tasks`` as it is returned; its
    ``doc_text`` is included in the response when present and non-empty.
    """
    entry = _analyze_tasks.get(task_id)
    if not entry:
        return {"ok": False, "status": "error", "result": "任务ID不存在"}
    if entry["status"] not in ("done", "error"):
        return {"ok": True, "status": "processing"}

    snapshot = entry.copy()
    del _analyze_tasks[task_id]
    reply = {"ok": True, "status": snapshot["status"], "result": snapshot["result"]}
    extracted = snapshot.get("doc_text")
    if extracted:
        reply["doc_text"] = extracted
    return reply
|
| 1011 |
+
|
| 1012 |
+
def _do_analyze_async(task_id: str, url: str, question: str, max_chars: int):
    """Worker: download the PDF at *url*, extract its text and finish the task.

    The first *max_chars* characters of the extracted text are handed to
    ``_do_analyze_text_async``.  Every failure — including a missing
    dependency — is recorded as an ``error`` entry in ``_analyze_tasks``, and
    the temporary download is always removed.
    """
    local_path = None
    try:
        # Imported inside the try so that an import failure marks the task as
        # failed instead of leaving it in "processing".
        import requests as _req
        import uuid as _uuid
        from pathlib import Path
        import pdfplumber

        # The context manager releases the streamed connection in every case.
        with _req.get(url, timeout=300, stream=True) as resp:
            if resp.status_code != 200:
                _analyze_tasks[task_id] = {"status": "error", "result": f"下载失败 HTTP {resp.status_code}"}
                return
            local_path = Path("/app") / f"inputs/{_uuid.uuid4().hex}.pdf"
            local_path.parent.mkdir(parents=True, exist_ok=True)
            with open(local_path, "wb") as f:
                for chunk in resp.iter_content(chunk_size=65536):
                    f.write(chunk)

        with pdfplumber.open(local_path) as p:
            page_texts = (page.extract_text() for page in p.pages)
            text = "".join(t + "\n" for t in page_texts if t)

        if not text.strip():
            _analyze_tasks[task_id] = {"status": "error", "result": "无法提取文本内容"}
            return
        _do_analyze_text_async(task_id, text[:max_chars], question)
    except Exception as e:
        _analyze_tasks[task_id] = {"status": "error", "result": f"分析失败: {e}"}
    finally:
        # Best-effort removal of the temporary download, also after failures.
        if local_path is not None:
            try:
                local_path.unlink(missing_ok=True)
            except OSError:
                pass
|
| 1044 |
+
|
| 1045 |
+
def _do_analyze_text_async(task_id: str, doc_text: str, question: str):
    """Mark an analysis task as done and attach *doc_text* to its record.

    *question* is accepted but not used by this function.
    """
    try:
        finished = dict(status="done", result="分析完成", doc_text=doc_text)
        _analyze_tasks[task_id] = finished
    except Exception as e:
        failure = dict(status="error", result=f"分析失败: {e}")
        _analyze_tasks[task_id] = failure
|
| 1050 |
+
|
| 1051 |
+
@app.post("/task/start")
async def task_start(request: Request):
    """Queue an agent task on ``_task_pool`` and return its task id.

    Raises:
        HTTPException: 400 when the request body is not valid JSON.
    """
    try:
        payload = await request.json()
    except Exception:
        raise HTTPException(status_code=400, detail="Invalid JSON")

    instruction = payload.get("task", "")
    conversation = payload.get("chat_id", "")
    kind = payload.get("task_type") or None
    past_turns = payload.get("history", [])
    if not instruction:
        return {"ok": False, "error": "task required"}

    new_id = str(uuid.uuid4())
    _task_tasks[new_id] = dict(status="processing", result=None)
    _task_pool.submit(_do_task_async, new_id, instruction, str(conversation), kind, past_turns)
    return {"ok": True, "task_id": new_id}
|
| 1067 |
+
|
| 1068 |
+
@app.get("/task/check/{task_id}")
async def task_check(task_id: str):
    """Report the state of an agent task.

    A finished task is removed from ``_task_tasks`` as it is returned.
    """
    entry = _task_tasks.get(task_id)
    if not entry:
        return {"ok": False, "status": "error", "result": "任务ID不存在"}
    if entry["status"] not in ("done", "error"):
        return {"ok": True, "status": "processing"}

    snapshot = entry.copy()
    del _task_tasks[task_id]
    return {"ok": True, "status": snapshot["status"], "result": snapshot["result"]}
|
| 1078 |
+
|
| 1079 |
+
def _do_task_async(task_id: str, task_text: str, chat_id: str, task_type: str = None, history: list = None):
    """Worker: run an agent task and store its outcome in ``_task_tasks``.

    The stringified return value of ``run_agent_task`` is stored with status
    ``done``; any exception (including a failed import) is stored with status
    ``error``.
    """
    past_turns = [] if history is None else history
    try:
        from scripts.ai_agent import run_agent_task
        outcome = run_agent_task(task_text, past_turns, None, chat_id, "consumer", task_type=task_type)
        record = {"status": "done", "result": str(outcome)}
        _task_tasks[task_id] = record
    except Exception as e:
        _task_tasks[task_id] = {"status": "error", "result": f"处理失败: {e}"}
|
| 1088 |
+
|
| 1089 |
+
@app.get("/skill-search")
async def skill_search(request: Request):
    """Return the skills whose name or description contains the ``q`` parameter.

    Matching is case-insensitive; descriptions are cut to 200 characters.
    An empty query yields an empty list.
    """
    needle = request.query_params.get("q", "").strip().lower()
    if not needle:
        return JSONResponse([])

    catalog = _get_skill_index().get("skills", [])
    hits = [
        {"id": skill["id"], "name": skill["name"], "description": skill.get("description", "")[:200]}
        for skill in catalog
        if needle in skill.get("name", "").lower() or needle in skill.get("description", "").lower()
    ]
    return JSONResponse(hits)
|
| 1100 |
+
|
| 1101 |
+
@app.get("/skill-view")
async def skill_view(request: Request):
    """Return the README (first 2000 characters) of the first matching skill.

    A skill matches when the ``name`` parameter is contained in its id or its
    name (case-insensitive).  Answers 400 without a name and 404 when no
    matching skill yields a README.
    """
    wanted = request.query_params.get("name", "").strip().lower()
    if not wanted:
        return JSONResponse({"error": "name required"}, status_code=400)

    for skill in _get_skill_index().get("skills", []):
        skill_id = skill.get("id", "").lower()
        if wanted not in skill_id and wanted not in skill.get("name", "").lower():
            continue
        readme_url = f"https://raw.githubusercontent.com/hughyonng/OpenWolf/refs/heads/main/skills/library/{skill_id}/README.md"
        try:
            reply = requests.get(readme_url, timeout=10)
            if reply.ok:
                return JSONResponse({"name": skill["name"], "content": reply.text[:2000]})
        except Exception:
            # Fetch problems just move the search on to the next candidate.
            pass

    return JSONResponse({"error": "not found"}, status_code=404)
|
| 1118 |
+
|
| 1119 |
+
# Downloaded skill index; filled by the first successful _get_skill_index() call.
SKILL_INDEX_CACHE = None


def _get_skill_index():
    """Return the skill library index, downloading it on first use.

    The raw.githubusercontent.com copy is tried first, then the GitHub
    contents API.  A successful download is kept in ``SKILL_INDEX_CACHE``;
    when both attempts fail an empty index is returned and nothing is cached.
    """
    global SKILL_INDEX_CACHE
    if SKILL_INDEX_CACHE:
        return SKILL_INDEX_CACHE

    gh_token = os.environ.get("GITHUB_TOKEN") or os.environ.get("GITHUB_PAT") or os.environ.get("OPENWOLF_PAT") or ""
    auth = {"Authorization": f"Bearer {gh_token}"} if gh_token else {}
    sources = (
        (
            "https://raw.githubusercontent.com/hughyonng/OpenWolf/main/skills/library-index.json",
            dict(auth),
        ),
        (
            "https://api.github.com/repos/hughyonng/OpenWolf/contents/skills/library-index.json",
            {**auth, "Accept": "application/vnd.github.v3.raw"},
        ),
    )
    for index_url, request_headers in sources:
        try:
            reply = requests.get(index_url, headers=request_headers, timeout=15)
            if reply.status_code == 200:
                SKILL_INDEX_CACHE = reply.json()
                return SKILL_INDEX_CACHE
        except Exception:
            # Fall through to the next source.
            pass
    return {"skills": []}
|
| 1143 |
+
|
| 1144 |
+
@app.get("/health")
async def health():
    """Health probe: readiness flag plus which expected environment variables are set."""
    watched = (
        "OPENROUTER_API_KEY", "GOOGLE_API_KEY", "CHATANYWHERE_API_KEY",
        "GROQ_API_KEY", "GITHUB_PAT", "GITHUB_REPO",
        "TELEGRAM_BOT_TOKEN", "TELEGRAM_CHAT_ID", "OPENWOLF_PAT",
    )
    env_status = {}
    for key in watched:
        # Only presence is reported, never the value itself.
        env_status[key] = "✅" if os.environ.get(key) else "❌"
    return {"status": "ok", "ready": _ready, "env": env_status}
|
requirements.txt
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ── OpenWolf HF Spaces 依赖 ──
|
| 2 |
+
fastapi>=0.110.0
|
| 3 |
+
uvicorn[standard]>=0.27.0
|
| 4 |
+
requests>=2.31.0
|
| 5 |
+
sentence-transformers>=3.0.0
|
| 6 |
+
faiss-cpu>=1.8.0
|
| 7 |
+
pyyaml>=6.0
|
| 8 |
+
python-docx>=1.1.0
|
| 9 |
+
openpyxl>=3.1.0
|
| 10 |
+
python-pptx>=0.6.23
|
| 11 |
+
pdfplumber>=0.10.0
|
| 12 |
+
xlrd>=1.2.0
|
| 13 |
+
httpx>=0.27.0
|
| 14 |
+
numpy>=1.24.0
|
| 15 |
+
huggingface_hub>=0.24.0
|
| 16 |
+
ebooklib>=0.18
|
| 17 |
+
beautifulsoup4>=4.12
|
| 18 |
+
mobi>=0.3
|