Upload folder using huggingface_hub
Browse files- .claude/settings.local.json +11 -0
- .env.example +35 -36
- .gitattributes +8 -0
- .gitignore +4 -1
- README.md +214 -92
- __pycache__/app.cpython-310.pyc +0 -0
- __pycache__/app.cpython-312.pyc +0 -0
- __pycache__/config.cpython-310.pyc +0 -0
- __pycache__/config.cpython-312.pyc +0 -0
- __pycache__/embeddings.cpython-310.pyc +0 -0
- __pycache__/embeddings.cpython-312.pyc +0 -0
- __pycache__/ocr_loader.cpython-310.pyc +0 -0
- __pycache__/ocr_loader.cpython-312.pyc +0 -0
- __pycache__/rag_chain.cpython-310.pyc +0 -0
- __pycache__/rag_chain.cpython-312.pyc +0 -0
- __pycache__/run.cpython-310.pyc +0 -0
- __pycache__/run.cpython-312.pyc +0 -0
- __pycache__/text_processor.cpython-310.pyc +0 -0
- __pycache__/text_processor.cpython-312.pyc +0 -0
- __pycache__/vector_store.cpython-310.pyc +0 -0
- __pycache__/vector_store.cpython-312.pyc +0 -0
- app.py +629 -0
- assets/OCR_RAG.mp4 +3 -0
- assets/image-1.png +3 -0
- assets/image-12.png +0 -0
- assets/image-13.png +3 -0
- assets/image-14.png +3 -0
- assets/image-15.png +0 -0
- assets/image-16.png +3 -0
- assets/image-2.png +0 -0
- assets/image-3.png +0 -0
- assets/image-4.png +3 -0
- assets/image-5.png +3 -0
- assets/image-7.png +3 -0
- assets/image.png +3 -0
- config.py +124 -0
- embeddings.py +198 -0
- ocr_loader.py +829 -0
- rag_chain.py +440 -0
- requirements.txt +31 -17
- run.py +465 -0
- static/index.html +637 -0
- test.png +3 -0
- text_processor.py +606 -0
- vector_store.py +307 -0
- 国药准字H37020386_布洛芬片.pdf +3 -0
.claude/settings.local.json
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"permissions": {
|
| 3 |
+
"allow": [
|
| 4 |
+
"WebSearch",
|
| 5 |
+
"Bash(/data/huangjie/miniforge3/envs/cad/bin/python --version)",
|
| 6 |
+
"Bash(/data/huangjie/miniforge3/envs/cad/bin/pip show *)",
|
| 7 |
+
"Bash(timeout 5 /data/huangjie/miniforge3/envs/cad/bin/python -c \"import pymupdf; print\\('pymupdf OK, version:', pymupdf.version\\)\")",
|
| 8 |
+
"Bash(python:*)"
|
| 9 |
+
]
|
| 10 |
+
}
|
| 11 |
+
}
|
.env.example
CHANGED
|
@@ -1,47 +1,46 @@
|
|
|
|
|
|
|
|
| 1 |
|
| 2 |
-
#
|
| 3 |
-
#
|
| 4 |
-
|
| 5 |
-
#
|
| 6 |
-
|
| 7 |
-
|
| 8 |
|
| 9 |
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
VLM_API_PORT = 8011
|
| 15 |
-
VLM_MODEL_NAME = "AXERA-TECH/Qwen3-VL-2B-Instruct"
|
| 16 |
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
LLM_MODEL_NAME = "AXERA-TECH/Qwen3-1.7B"
|
| 24 |
|
| 25 |
|
| 26 |
-
#
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
# ASR_API_KEY = "xxx"
|
| 30 |
-
SHERPA_ASR_API_PORT = 8013
|
| 31 |
-
SHERPA_MODEL_FILE = "/root/huangjie/AXERA-TECH/SenseVoice/ax650/model-10-seconds.axmodel"
|
| 32 |
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
-
#
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
Tokenizer_API_PORT = 8014
|
| 39 |
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
VIDEORAG_SEGMENT_RETRIEVAL_TOP_K = "2"
|
|
|
|
| 1 |
+
# ============================================================
|
| 2 |
+
# 环境变量配置
|
| 3 |
|
| 4 |
+
# .env 配置
|
| 5 |
+
OCR_ENGINE=api # 改为 api 模式
|
| 6 |
+
OCR_API_BASE=http://127.0.0.1:8015/v1 # vLLM 服务地址
|
| 7 |
+
OCR_API_MODEL=AXERA-TECH/PaddleOCR-VL-1.5 # 模型名
|
| 8 |
+
OCR_API_KEY=not-needed
|
| 9 |
+
OCR_TASK=ocr # 任务类型
|
| 10 |
|
| 11 |
|
| 12 |
+
EMBEDDING_MODEL_NAME=AXERA-TECH/Qwen3-Embedding-0.6B
|
| 13 |
+
EMBEDDING_API_BASE=http://127.0.0.1:8014/v1
|
| 14 |
+
EMBEDDING_API_KEY=not-needed
|
| 15 |
+
EMBEDDING_BATCH_SIZE=4
|
|
|
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
+
LLM_API_BASE=http://127.0.0.1:8013/v1
|
| 19 |
+
LLM_API_KEY=not-needed
|
| 20 |
+
LLM_MODEL_NAME=AXERA-TECH/Qwen3-1.7B-GPTQ-Int4
|
| 21 |
+
LLM_TEMPERATURE=0.1
|
| 22 |
+
LLM_MAX_TOKENS=2048
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
+
# ---- 向量数据库 ----
|
| 26 |
+
VECTOR_STORE_TYPE=chroma
|
| 27 |
+
CHROMA_COLLECTION_NAME=pdf_ocr_knowledge
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
+
# ---- PDF 渲染 ----
|
| 30 |
+
PDF_RENDER_DPI=300
|
| 31 |
+
MAX_FILE_SIZE_MB=50
|
| 32 |
|
| 33 |
+
# ---- 文本分割与检索 ----
|
| 34 |
+
CHUNK_SIZE=800
|
| 35 |
+
CHUNK_OVERLAP=150
|
| 36 |
+
RETRIEVAL_TOP_K=5
|
|
|
|
| 37 |
|
| 38 |
+
# ---- 日志 ----
|
| 39 |
+
LOG_LEVEL=INFO
|
| 40 |
|
| 41 |
+
OCR_VL_BACKEND=native
|
| 42 |
+
OCR_USE_LAYOUT=false
|
| 43 |
+
OCR_LAYOUT_THRESHOLD=0.5
|
| 44 |
+
OCR_USE_CHART=false
|
| 45 |
+
OCR_MAX_NEW_TOKENS=4096
|
| 46 |
+
OCR_TEMPERATURE=0.0
|
|
|
.gitattributes
CHANGED
|
@@ -44,3 +44,11 @@ image-10.png filter=lfs diff=lfs merge=lfs -text
|
|
| 44 |
image-7.png filter=lfs diff=lfs merge=lfs -text
|
| 45 |
image-8.png filter=lfs diff=lfs merge=lfs -text
|
| 46 |
image-9.png filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
image-7.png filter=lfs diff=lfs merge=lfs -text
|
| 45 |
image-8.png filter=lfs diff=lfs merge=lfs -text
|
| 46 |
image-9.png filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
assets/image-1.png filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
assets/image-13.png filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
assets/image-14.png filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
assets/image-16.png filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
assets/image.png filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
assets/OCR_RAG.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
test.png filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
国药准字H37020386_布洛芬片.pdf filter=lfs diff=lfs merge=lfs -text
|
.gitignore
CHANGED
|
@@ -1,2 +1,5 @@
|
|
| 1 |
.env
|
| 2 |
-
uphg.py
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
.env
|
| 2 |
+
uphg.py
|
| 3 |
+
.claude/
|
| 4 |
+
.vscode/
|
| 5 |
+
__pycache__/
|
README.md
CHANGED
|
@@ -1,40 +1,88 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
-
基于
|
| 4 |
|
| 5 |
-
|
| 6 |
-
<img src="https://img.shields.io/badge/platform-AX650N-blue" alt="Platform">
|
| 7 |
-
<img src="https://img.shields.io/badge/python-3.10+-green" alt="Python">
|
| 8 |
-
|
| 9 |
-
</p>
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
|
| 16 |
-
- **视频智能索引** — 自动分段、特征提取、多模态信息融合(ASR + VLM)
|
| 17 |
-
- **向量检索** — 高效相似度检索与结果融合,支持跨模态查询
|
| 18 |
-
- **自然语言问答** — 用自然语言提问,基于视频内容生成回答
|
| 19 |
|
| 20 |
-
-
|
|
|
|
| 21 |
|
| 22 |
-
##
|
| 23 |
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|---------|---------|------|
|
| 28 |
-
| **ASR** | [SenseVoiceSmall-axmodel](https://huggingface.co/M5Stack/SenseVoiceSmall-axmodel) | 多语言语音理解模型 |
|
| 29 |
-
| **VLM** | [Qwen3-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/AXERA-TECH/Qwen3-VL-2B-Instruct-GPTQ-Int4) | 多模态视觉语言模型 |
|
| 30 |
-
| **LLM** | [Qwen3-1.7B](https://huggingface.co/AXERA-TECH/Qwen3-1.7B) | 大语言模型 |
|
| 31 |
-
| **Embedding** | [Qwen3-VL-Embedding-2B-AX650](https://huggingface.co/AXERA-TECH/Qwen3-VL-Embedding-2B-AX650-C128_P1280_CTX1407) | 多模态嵌入模型 |
|
| 32 |
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
## 快速开始
|
| 36 |
|
| 37 |
-
### 1.
|
| 38 |
|
| 39 |
```bash
|
| 40 |
pip install -r requirements.txt
|
|
@@ -42,7 +90,7 @@ pip install -r requirements.txt
|
|
| 42 |
|
| 43 |
### 2. 配置环境变量
|
| 44 |
|
| 45 |
-
|
| 46 |
|
| 47 |
```bash
|
| 48 |
cp .env.example .env
|
|
@@ -53,22 +101,24 @@ cp .env.example .env
|
|
| 53 |
|
| 54 |
```ini
|
| 55 |
# LLM API(OpenAI API 格式)
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
#
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
#
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
| 72 |
```
|
| 73 |
|
| 74 |
### 3. 启动模型服务
|
|
@@ -76,86 +126,158 @@ Tokenizer_API_PORT = 8014
|
|
| 76 |
基于 AX650N 芯片启动各模型服务:
|
| 77 |
|
| 78 |
```bash
|
| 79 |
-
#
|
| 80 |
-
axllm serve /root/huangjie/AXERA-TECH/models--AXERA-TECH--Qwen3-
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
-
#
|
| 83 |
-
axllm serve /root/huangjie/AXERA-TECH/
|
|
|
|
| 84 |
|
| 85 |
-
#
|
| 86 |
-
axllm serve /root/huangjie/AXERA-TECH/models--AXERA-TECH--Qwen3-1.7B --port 8012
|
| 87 |
|
| 88 |
-
#
|
| 89 |
-
python VideoAgent/_server/sherpa_asr_server.py
|
| 90 |
|
| 91 |
-
|
| 92 |
-
python
|
| 93 |
```
|
| 94 |
|
| 95 |
-
|
| 96 |
|
| 97 |
-
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
```
|
| 102 |
|
| 103 |
-
|
| 104 |
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
-
###
|
| 110 |
|
| 111 |
```python
|
| 112 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
-
#
|
| 115 |
-
|
|
|
|
| 116 |
|
| 117 |
-
#
|
| 118 |
-
|
| 119 |
|
| 120 |
-
#
|
| 121 |
-
|
| 122 |
-
|
| 123 |
```
|
| 124 |
|
| 125 |
---
|
| 126 |
|
| 127 |
-
##
|
| 128 |
|
| 129 |
-
### 视频
|
| 130 |
|
| 131 |
-
|
| 132 |
-
### 查询流程
|
| 133 |
|
|
|
|
| 134 |
|
| 135 |
-
|
| 136 |
|
|
|
|
| 137 |
|
| 138 |
-
-
|
| 139 |
|
| 140 |
-
|
| 141 |
|
| 142 |
-
|
| 143 |
-
VideoAgent-AX650N/
|
| 144 |
-
├── VideoAgent/ # 核心包
|
| 145 |
-
│ ├── _llm/ # 模型定义层
|
| 146 |
-
│ ├── _server/ # 服务层(FastAPI)
|
| 147 |
-
│ ├── _storage/ # 存储层
|
| 148 |
-
│ ├── _videoutil/ # 视频处理工具
|
| 149 |
-
│ └── vidrag_pipeline.py # 核心管道
|
| 150 |
-
├── working_dir/ # 运行时数据目录
|
| 151 |
-
├── webui.py # Gradio Web 入口
|
| 152 |
-
├── videorag_longervideos.py # 测试脚本
|
| 153 |
-
└── README.md # 项目文档
|
| 154 |
-
```
|
| 155 |
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
-
##
|
| 159 |
|
| 160 |
-
|
| 161 |
|
|
|
|
|
|
| 1 |
+
# 基于 OCR + RAG 的文档智能问答系统
|
| 2 |
|
| 3 |
+
基于 **PaddleOCR-VL** + **Qwen3-Embedding** + **Qwen3** + **LangChain RAG** 的文档智能问答系统,支持 PDF、扫描件及常见图片格式的端到端识别与检索问答。
|
| 4 |
|
| 5 |
+
## 模型栈
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
+
| 模型类型 | 模型名称 | 说明 |
|
| 8 |
+
|---------|---------|------|
|
| 9 |
+
| **OCR** | [PaddleOCR-VL-1.5](https://huggingface.co/AXERA-TECH/PaddleOCR-VL-1.5) | OCR 识别模型 |
|
| 10 |
+
| **LLM** | [Qwen3-1.7B-GPTQ-Int4](https://huggingface.co/AXERA-TECH/Qwen3-1.7B-GPTQ-Int4) | 大语言模型 |
|
| 11 |
+
| **Embedding** | [Qwen3-Embedding-0.6B](https://huggingface.co/AXERA-TECH/Qwen3-Embedding-0.6B) | 文本嵌入模型 |
|
| 12 |
|
| 13 |
+
## 支持格式
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
+
- PDF(文字型 / 扫描版)
|
| 16 |
+
- PNG / JPG / JPEG / BMP / TIF / TIFF
|
| 17 |
|
| 18 |
+
## 架构
|
| 19 |
|
| 20 |
+
```
|
| 21 |
+
文件上传 (PDF/PNG/JPG/BMP/TIF)
|
| 22 |
+
│
|
| 23 |
+
▼
|
| 24 |
+
┌─────────────────────────────────┐
|
| 25 |
+
│ PaddleOCR-VL │
|
| 26 |
+
│ 端到端识别: 文本 + 版面 + 表格 │
|
| 27 |
+
│ 输出: Markdown / JSON │
|
| 28 |
+
└──────────────┬──────────────────┘
|
| 29 |
+
│ LangChain Documents
|
| 30 |
+
▼
|
| 31 |
+
┌─────────────────────────────────┐
|
| 32 |
+
│ 文本处理 │
|
| 33 |
+
│ Markdown 清洗 → 语义感知分割 │
|
| 34 |
+
│ 表格/公式 独立提取 │
|
| 35 |
+
└──────────────┬──────────────────┘
|
| 36 |
+
│ Document Chunks
|
| 37 |
+
▼
|
| 38 |
+
┌─────────────────────────────────┐
|
| 39 |
+
│ Qwen3-Embedding │
|
| 40 |
+
│ instruct-aware 向量嵌入 │
|
| 41 |
+
└──────────────┬──────────────────┘
|
| 42 |
+
│ Vector Embeddings
|
| 43 |
+
▼
|
| 44 |
+
┌─────────────────────────────────┐
|
| 45 |
+
│ Chroma / FAISS 向量数据库 │
|
| 46 |
+
│ 相似度检索 / MMR / 元数据过滤 │
|
| 47 |
+
└──────────────┬──────────────────┘
|
| 48 |
+
│ Top-K 相关文档
|
| 49 |
+
▼
|
| 50 |
+
┌─────────────────────────────────┐
|
| 51 |
+
│ Qwen3-1.7B │
|
| 52 |
+
│ LangChain LCEL RAG 链 │
|
| 53 |
+
│ 多轮对话 + 来源引用 │
|
| 54 |
+
└──────────────┬──────────────────┘
|
| 55 |
+
│
|
| 56 |
+
▼
|
| 57 |
+
┌─────────────────────────────────┐
|
| 58 |
+
│ Web UI (FastAPI) │
|
| 59 |
+
│ 上传 | 问答 | 来源 | 状态 │
|
| 60 |
+
└─────────────────────────────────┘
|
| 61 |
+
```
|
| 62 |
|
| 63 |
+
## 项目结构
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
+
```
|
| 66 |
+
pdfocr/
|
| 67 |
+
├── requirements.txt # Python 依赖
|
| 68 |
+
├── .env.example # 环境变量模板
|
| 69 |
+
├── config.py # 全局配置中心
|
| 70 |
+
├── ocr_loader.py # PaddleOCR-VL 加载器 (支持多格式)
|
| 71 |
+
├── text_processor.py # Markdown 清洗 + 智能分割
|
| 72 |
+
├── embeddings.py # Qwen3-Embedding 向量嵌入
|
| 73 |
+
├── vector_store.py # 向量数据库管理 (Chroma/FAISS)
|
| 74 |
+
├── rag_chain.py # RAG 问答链 (Qwen3)
|
| 75 |
+
├── app.py # Web UI
|
| 76 |
+
└── data/ # 运行时数据
|
| 77 |
+
├── uploads/
|
| 78 |
+
├── ocr_output/
|
| 79 |
+
├── vector_db/
|
| 80 |
+
└── logs/
|
| 81 |
+
```
|
| 82 |
|
| 83 |
## 快速开始
|
| 84 |
|
| 85 |
+
### 1. 环境准备
|
| 86 |
|
| 87 |
```bash
|
| 88 |
pip install -r requirements.txt
|
|
|
|
| 90 |
|
| 91 |
### 2. 配置环境变量
|
| 92 |
|
| 93 |
+
OCR、LLM、Embedding 均通过环境变量配置,兼容 OpenAI API 格式。
|
| 94 |
|
| 95 |
```bash
|
| 96 |
cp .env.example .env
|
|
|
|
| 101 |
|
| 102 |
```ini
|
| 103 |
# LLM API(OpenAI API 格式)
|
| 104 |
+
LLM_API_KEY=not-needed
|
| 105 |
+
LLM_API_BASE=http://127.0.0.1:8013/v1
|
| 106 |
+
LLM_MODEL_NAME=AXERA-TECH/Qwen3-1.7B-GPTQ-Int4
|
| 107 |
+
LLM_TEMPERATURE=0.1
|
| 108 |
+
LLM_MAX_TOKENS=2048
|
| 109 |
+
|
| 110 |
+
# Embedding API
|
| 111 |
+
EMBEDDING_MODEL_NAME=AXERA-TECH/Qwen3-Embedding-0.6B
|
| 112 |
+
EMBEDDING_API_BASE=http://127.0.0.1:8014/v1
|
| 113 |
+
EMBEDDING_API_KEY=not-needed
|
| 114 |
+
EMBEDDING_BATCH_SIZE=4
|
| 115 |
+
|
| 116 |
+
# OCR API
|
| 117 |
+
OCR_ENGINE=api
|
| 118 |
+
OCR_API_BASE=http://127.0.0.1:8015/v1
|
| 119 |
+
OCR_API_MODEL=AXERA-TECH/PaddleOCR-VL-1.5
|
| 120 |
+
OCR_API_KEY=not-needed
|
| 121 |
+
OCR_TASK=ocr
|
| 122 |
```
|
| 123 |
|
| 124 |
### 3. 启动模型服务
|
|
|
|
| 126 |
基于 AX650N 芯片启动各模型服务:
|
| 127 |
|
| 128 |
```bash
|
| 129 |
+
# LLM 服务 — 端口 8013
|
| 130 |
+
axllm serve /root/huangjie/AXERA-TECH/models--AXERA-TECH--Qwen3-1.7B --port 8013
|
| 131 |
+
|
| 132 |
+
# Embedding 服务 — 端口 8014
|
| 133 |
+
axllm serve /root/huangjie/AXERA-TECH/models--AXERA-TECH--Qwen3-Embedding-0.6B --port 8014
|
| 134 |
|
| 135 |
+
# OCR 服务 — 端口 8015
|
| 136 |
+
axllm serve /root/huangjie/AXERA-TECH/PaddleOCR-VL-1.5 --port 8015
|
| 137 |
+
```
|
| 138 |
|
| 139 |
+
## 使用方式
|
|
|
|
| 140 |
|
| 141 |
+
### 1. Web UI(推荐)
|
|
|
|
| 142 |
|
| 143 |
+
```bash
|
| 144 |
+
python app.py
|
| 145 |
```
|
| 146 |
|
| 147 |
+
浏览器访问 **http://localhost:7860**
|
| 148 |
|
| 149 |
+
**问答界面**
|
| 150 |
|
| 151 |
+

|
| 152 |
+
|
| 153 |
+
**预览界面**
|
| 154 |
+
|
| 155 |
+

|
| 156 |
+
|
| 157 |
+
**设置界面**
|
| 158 |
+
|
| 159 |
+

|
| 160 |
+
|
| 161 |
+
### 2. Python API
|
| 162 |
+
|
| 163 |
+
```python
|
| 164 |
+
from rag_chain import PDFRAGPipeline
|
| 165 |
+
|
| 166 |
+
# 初始化流水线
|
| 167 |
+
pipeline = PDFRAGPipeline()
|
| 168 |
+
|
| 169 |
+
# 处理文档 (支持 PDF/PNG/JPG/BMP/TIF)
|
| 170 |
+
pipeline.ingest("document.pdf")
|
| 171 |
+
pipeline.ingest("scan.png")
|
| 172 |
+
|
| 173 |
+
# 问答
|
| 174 |
+
result = pipeline.ask("文档主要内容是什么?")
|
| 175 |
+
print(result["answer"])
|
| 176 |
+
print(result["sources"])
|
| 177 |
+
|
| 178 |
+
# 多轮对话
|
| 179 |
+
result = pipeline.ask_with_history(
|
| 180 |
+
"那第二章呢?",
|
| 181 |
+
chat_history=[
|
| 182 |
+
{"role": "user", "content": "文档主要讲什么?"},
|
| 183 |
+
{"role": "assistant", "content": "文档主要介绍了..."},
|
| 184 |
+
]
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
# 流式输出
|
| 188 |
+
for chunk in pipeline.ask_stream("请总结文档"):
|
| 189 |
+
print(chunk, end="", flush=True)
|
| 190 |
```
|
| 191 |
|
| 192 |
+
### 3. 命令行
|
| 193 |
|
| 194 |
+
```bash
|
| 195 |
+
# 直接对文件提问
|
| 196 |
+
python rag_chain.py document.pdf "文档主要内容是什么?"
|
| 197 |
+
|
| 198 |
+
# OCR 识别并输出 Markdown
|
| 199 |
+
python ocr_loader.py scan.png --md
|
| 200 |
+
|
| 201 |
+
# OCR 识别并输出 JSON
|
| 202 |
+
python ocr_loader.py document.pdf --json
|
| 203 |
+
```
|
| 204 |
|
| 205 |
+
### 4. 分步使用
|
| 206 |
|
| 207 |
```python
|
| 208 |
+
from ocr_loader import PaddleOCRLoader
|
| 209 |
+
from text_processor import TextProcessingPipeline
|
| 210 |
+
from vector_store import build_vector_store
|
| 211 |
+
from rag_chain import RAGChain
|
| 212 |
+
|
| 213 |
+
# 1. OCR
|
| 214 |
+
loader = PaddleOCRLoader("document.pdf", dpi=300)
|
| 215 |
+
documents = loader.load()
|
| 216 |
|
| 217 |
+
# 2. 文本处理
|
| 218 |
+
pipeline = TextProcessingPipeline(chunk_size=800, chunk_overlap=150)
|
| 219 |
+
chunks = pipeline.process(documents)
|
| 220 |
|
| 221 |
+
# 3. 向量化
|
| 222 |
+
manager = build_vector_store(chunks)
|
| 223 |
|
| 224 |
+
# 4. 问答
|
| 225 |
+
chain = RAGChain(vector_store_manager=manager)
|
| 226 |
+
result = chain.query("文档主要内容?")
|
| 227 |
```
|
| 228 |
|
| 229 |
---
|
| 230 |
|
| 231 |
+
## 案例演示
|
| 232 |
|
| 233 |
+
### 演示视频
|
| 234 |
|
| 235 |
+
[观看演示视频](assets/OCR_RAG.mp4)
|
|
|
|
| 236 |
|
| 237 |
+
### 使用步骤
|
| 238 |
|
| 239 |
+
**1. 在 AX650N 芯片上启动模型服务**
|
| 240 |
|
| 241 |
+
LLM 服务
|
| 242 |
|
| 243 |
+

|
| 244 |
|
| 245 |
+
Embedding 服务
|
| 246 |
|
| 247 |
+

|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
|
| 249 |
+
OCR 服务
|
| 250 |
+
|
| 251 |
+

|
| 252 |
+
|
| 253 |
+
运行启动服务
|
| 254 |
+
|
| 255 |
+

|
| 256 |
+
|
| 257 |
+
**2. 上传原始文件**
|
| 258 |
+
|
| 259 |
+
支持 PDF / PNG / JPG / BMP / TIF
|
| 260 |
+
|
| 261 |
+

|
| 262 |
+
|
| 263 |
+
**3. 进行 OCR 识别**
|
| 264 |
+
|
| 265 |
+
OCR 识别并输出文本,支持原始文件和 OCR 结果同时查看:
|
| 266 |
+
|
| 267 |
+

|
| 268 |
+
|
| 269 |
+

|
| 270 |
+
|
| 271 |
+
**4. RAG 智能问答**
|
| 272 |
+
|
| 273 |
+
根据输入内容检索相关文本片段并返回结果。
|
| 274 |
+
|
| 275 |
+
例如提问「布洛芬每日用量」,系统检索到说明书中关于用量的文本片段,依据该文本进行回答:
|
| 276 |
+
|
| 277 |
+

|
| 278 |
|
| 279 |
+
## 硬件资源使用
|
| 280 |
|
| 281 |
+
基于 AX650N 平台运行本项目时,内存(CMM)、Flash 占用情况如下:
|
| 282 |
|
| 283 |
+

|
__pycache__/app.cpython-310.pyc
ADDED
|
Binary file (16 kB). View file
|
|
|
__pycache__/app.cpython-312.pyc
ADDED
|
Binary file (20.9 kB). View file
|
|
|
__pycache__/config.cpython-310.pyc
ADDED
|
Binary file (3.2 kB). View file
|
|
|
__pycache__/config.cpython-312.pyc
ADDED
|
Binary file (4.98 kB). View file
|
|
|
__pycache__/embeddings.cpython-310.pyc
ADDED
|
Binary file (5.54 kB). View file
|
|
|
__pycache__/embeddings.cpython-312.pyc
ADDED
|
Binary file (8.24 kB). View file
|
|
|
__pycache__/ocr_loader.cpython-310.pyc
ADDED
|
Binary file (22.5 kB). View file
|
|
|
__pycache__/ocr_loader.cpython-312.pyc
ADDED
|
Binary file (37.8 kB). View file
|
|
|
__pycache__/rag_chain.cpython-310.pyc
ADDED
|
Binary file (13 kB). View file
|
|
|
__pycache__/rag_chain.cpython-312.pyc
ADDED
|
Binary file (19.6 kB). View file
|
|
|
__pycache__/run.cpython-310.pyc
ADDED
|
Binary file (13.3 kB). View file
|
|
|
__pycache__/run.cpython-312.pyc
ADDED
|
Binary file (20.6 kB). View file
|
|
|
__pycache__/text_processor.cpython-310.pyc
ADDED
|
Binary file (14.3 kB). View file
|
|
|
__pycache__/text_processor.cpython-312.pyc
ADDED
|
Binary file (22.7 kB). View file
|
|
|
__pycache__/vector_store.cpython-310.pyc
ADDED
|
Binary file (9.13 kB). View file
|
|
|
__pycache__/vector_store.cpython-312.pyc
ADDED
|
Binary file (13.9 kB). View file
|
|
|
app.py
ADDED
|
@@ -0,0 +1,629 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
============================================================
|
| 3 |
+
OCR RAG 智能问答系统 - Web UI (FastAPI)
|
| 4 |
+
============================================================
|
| 5 |
+
|
| 6 |
+
启动:
|
| 7 |
+
python app.py
|
| 8 |
+
访问: http://localhost:7860
|
| 9 |
+
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import gc
|
| 13 |
+
import time
|
| 14 |
+
import shutil
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from typing import List, Optional, Dict, Any, Tuple
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _apply_env_patches():
    """Apply workarounds for known environment incompatibilities early.

    When no ``langchain_text_splitters`` module is registered yet, an empty
    package-like placeholder is put into ``sys.modules`` so that importing
    that name does not execute the real package's ``__init__`` (which, per
    the original author's note, pulls in a broken
    sentence_transformers -> transformers import chain). The project's own
    ``RecursiveCharacterTextSplitter`` is then attached to whichever module
    object is registered under that name, and ``torch`` is imported on a
    best-effort basis.
    """
    import sys
    import types

    module_name = "langchain_text_splitters"

    # Step 1: register a placeholder only if nothing is loaded under the name.
    if module_name not in sys.modules:
        placeholder = types.ModuleType(module_name)
        placeholder.__path__ = []
        sys.modules[module_name] = placeholder

    # Step 2: expose our splitter on the registered module object.
    registered = sys.modules[module_name]
    from text_processor import RecursiveCharacterTextSplitter as OurSplitter

    registered.RecursiveCharacterTextSplitter = OurSplitter

    # Step 3: make torch importable for transformers; a missing torch is fine.
    if "torch" in sys.modules:
        return
    try:
        import torch  # noqa: F401
    except ImportError:
        pass
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
_apply_env_patches()
|
| 46 |
+
|
| 47 |
+
from fastapi import FastAPI, File, Form, UploadFile, HTTPException
|
| 48 |
+
from fastapi.responses import HTMLResponse, FileResponse, JSONResponse
|
| 49 |
+
from fastapi.staticfiles import StaticFiles
|
| 50 |
+
from pydantic import BaseModel
|
| 51 |
+
from loguru import logger
|
| 52 |
+
|
| 53 |
+
import config
|
| 54 |
+
from rag_chain import PDFRAGPipeline, RAGChain
|
| 55 |
+
from vector_store import VectorStoreManager
|
| 56 |
+
from ocr_loader import PaddleOCRLoader
|
| 57 |
+
from text_processor import TextProcessingPipeline
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# ============================================================
|
| 61 |
+
# 全局状态
|
| 62 |
+
# ============================================================
|
| 63 |
+
|
| 64 |
+
_pipeline: Optional[PDFRAGPipeline] = None
|
| 65 |
+
_processed_files: List[Dict[str, Any]] = []
|
| 66 |
+
_chat_history: List[Dict[str, str]] = []
|
| 67 |
+
|
| 68 |
+
# OCR 文本持久化目录
|
| 69 |
+
_OCR_OUTPUT_DIR = config.OCR_OUTPUT_DIR
|
| 70 |
+
_FILES_JSON = _OCR_OUTPUT_DIR / "_files.json"
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def _load_files_from_disk():
    """Restore the processed-file registry from disk at startup.

    Reads ``_FILES_JSON`` when it exists and rebinds the module-level
    ``_processed_files`` to the list stored under its ``"files"`` key
    (an absent key yields an empty list). A missing registry file is a
    no-op.

    The payload is validated *before* the global is rebound: if the file is
    unreadable, is not valid JSON, is not a JSON object, or holds a non-list
    under ``"files"``, a warning is logged and the current in-memory list is
    left untouched, so later ``_processed_files.append(...)`` calls never hit
    a non-list value.
    """
    global _processed_files
    import json

    if not _FILES_JSON.exists():
        return

    try:
        data = json.loads(_FILES_JSON.read_text(encoding="utf-8"))
        files = data.get("files", [])
        if not isinstance(files, list):
            raise TypeError(
                f"expected a list under 'files', got {type(files).__name__}"
            )
    except Exception as e:
        # Startup recovery is deliberately best-effort: any failure is logged
        # and the application continues with the registry it already has.
        logger.warning(f"恢复文件列表失败: {e}")
        return

    _processed_files = files
    logger.info(f"从磁盘恢复 {len(_processed_files)} 个已处理文件")
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def _save_files_to_disk():
    """Persist the processed-file registry to ``_FILES_JSON``.

    The module-level ``_processed_files`` list is serialised as
    ``{"files": [...]}`` in UTF-8 with non-ASCII characters kept readable.
    The payload is first written to a sibling ``*.tmp`` file and then moved
    over the target, so a failure part-way through writing leaves the
    previous registry in place instead of a truncated JSON file.
    """
    import json

    payload = json.dumps({"files": _processed_files}, ensure_ascii=False, indent=2)

    _FILES_JSON.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = _FILES_JSON.with_name(_FILES_JSON.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    # Path.replace overwrites an existing target in a single rename step.
    tmp_path.replace(_FILES_JSON)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def _get_ocr_text_path(filename: str) -> Path:
    """Return the on-disk location of the OCR text for *filename*.

    Only the stem of the final path component is used, so any directory part
    and the last extension are dropped and ``.txt`` is appended.
    NOTE(review): two uploads that differ only in extension map to the same
    OCR text file.
    """
    stem = Path(filename).stem
    return _OCR_OUTPUT_DIR / (stem + ".txt")
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def _save_ocr_text(filename: str, text: str):
    """Write *text* as the persisted OCR output for *filename* (UTF-8).

    The output directory is created on demand; an existing file for the same
    name is overwritten.
    """
    destination = _get_ocr_text_path(filename)
    output_dir = destination.parent
    output_dir.mkdir(parents=True, exist_ok=True)
    destination.write_text(text, encoding="utf-8")
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _load_ocr_text(filename: str) -> str:
    """Return the persisted OCR text for *filename*, or ``""`` if none exists."""
    text_file = _get_ocr_text_path(filename)
    return text_file.read_text(encoding="utf-8") if text_file.exists() else ""
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def _delete_ocr_text(filename: str):
    """Remove the persisted OCR text for *filename*; do nothing if absent."""
    text_file = _get_ocr_text_path(filename)
    if not text_file.exists():
        return
    text_file.unlink()
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def get_pipeline() -> PDFRAGPipeline:
    """Return the shared pipeline, creating it on first use.

    The instance is stored in the module-level ``_pipeline`` and reused by
    every later call.
    """
    global _pipeline
    if _pipeline is not None:
        return _pipeline
    _pipeline = PDFRAGPipeline(verbose=False)
    return _pipeline
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
# ============================================================
|
| 131 |
+
# 核心处理逻辑 (从原 Gradio 回调中提取)
|
| 132 |
+
# ============================================================
|
| 133 |
+
|
| 134 |
+
def process_file_impl(
    file_path: Path,
    chunk_size: int = 800,
    chunk_overlap: int = 150,
) -> Tuple[Dict[str, Any], str]:
    """Process an uploaded file: OCR -> split -> embed into the vector store.

    Args:
        file_path: Location of the uploaded file on disk.
        chunk_size: Chunk size for the text splitter. It is passed to
            ``TextProcessingPipeline`` on every call, and to
            ``PDFRAGPipeline`` only when the shared pipeline is first created.
        chunk_overlap: Overlap between consecutive chunks (same forwarding
            as ``chunk_size``).

    Returns:
        ``(file_info, preview)``. ``file_info`` is the metadata dict that is
        also appended to ``_processed_files``. ``preview`` is built from the
        first three OCR documents, 200 characters each, plus a total-count
        line when there are more than three.

    Raises:
        ValueError: If the suffix is not in ``config.SUPPORTED_FORMATS`` or
            the file is larger than ``config.MAX_FILE_SIZE_MB``.

    Side effects:
        Writes the full OCR text to the file's OCR dump, creates a new
        ``VectorStoreManager``, clears it and adds this file's chunks,
        rebuilds the RAG chain on the shared pipeline, and resets the chat
        history.
    """
    global _pipeline, _processed_files, _chat_history

    suffix = file_path.suffix.lower()

    if suffix not in config.SUPPORTED_FORMATS:
        raise ValueError(
            f"不支持的文件格式: {suffix}\n支持: {', '.join(sorted(config.SUPPORTED_FORMATS))}"
        )

    file_size_mb = file_path.stat().st_size / (1024 * 1024)
    if file_size_mb > config.MAX_FILE_SIZE_MB:
        raise ValueError(f"文件过大: {file_size_mb:.1f}MB (限制: {config.MAX_FILE_SIZE_MB}MB)")

    # Reuse the pipeline object so the LLM instance is not recreated per upload.
    if _pipeline is None:
        _pipeline = PDFRAGPipeline(
            chunk_size=int(chunk_size),
            chunk_overlap=int(chunk_overlap),
            verbose=False,
        )

    loader = PaddleOCRLoader(str(file_path), verbose=False)
    raw_docs = loader.load()

    # Capture the count now: raw_docs is emptied after splitting, and the
    # previous code therefore always had to re-read the OCR dump from disk
    # and count header substrings to report the page total.
    page_count = len(raw_docs)

    # Stream the OCR text to disk page by page instead of assembling one
    # large string in memory.
    ocr_path = _get_ocr_text_path(file_path.name)
    ocr_path.parent.mkdir(parents=True, exist_ok=True)
    preview_parts = []
    with open(ocr_path, "w", encoding="utf-8") as ocr_f:
        for i, doc in enumerate(raw_docs):
            page_num = doc.metadata.get("page", i + 1)
            ocr_f.write(f"--- 第 {page_num} 页 ---\n{doc.page_content}\n\n")
            if i < 3:
                preview_parts.append(
                    f"--- 第 {page_num} 页 ---\n{doc.page_content[:200]}..."
                )
    if page_count > 3:
        preview_parts.append(f"\n... (共 {page_count} 页/文档)")
    preview = "\n\n".join(preview_parts)

    # Split the OCR documents into chunks.
    pipeline = TextProcessingPipeline(
        chunk_size=int(chunk_size),
        chunk_overlap=int(chunk_overlap),
    )
    chunks = pipeline.process(raw_docs)

    # Drop the references held by raw_docs so the GC can reclaim them.
    raw_docs.clear()

    # Embed the chunks into a freshly cleared vector store.
    _pipeline._vector_store_manager = VectorStoreManager(
        store_type=config.VECTOR_STORE_TYPE,
    )
    _pipeline._vector_store_manager.clear()
    _pipeline._vector_store_manager.add_documents(chunks)

    _pipeline._rag_chain = RAGChain(
        vector_store_manager=_pipeline._vector_store_manager,
        llm=_pipeline.llm,
    )

    _chat_history = []

    file_info = {
        "name": file_path.name,
        "format": suffix,
        # An empty OCR result keeps the previous fallback (reported as 1).
        "pages": page_count or _count_ocr_pages(ocr_path),
        "chunks": len(chunks),
        "size_mb": round(file_size_mb, 2),
        "time": time.strftime("%Y-%m-%d %H:%M:%S"),
        "path": str(file_path),
    }
    _processed_files.append(file_info)

    # Force a collection of the temporaries produced during OCR.
    del chunks
    gc.collect()

    logger.info(f"文件处理成功: {file_path.name}, {file_info['pages']} 页, {file_info['chunks']} 块")
    return file_info, preview
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
def _count_ocr_pages(ocr_path: Path) -> int:
    """Count the pages recorded in a saved OCR text dump.

    A page is counted for every line that *starts* with the page-header
    prefix ``--- 第`` (headers are written as ``--- 第 N 页 ---`` on their own
    line). Occurrences of the prefix in the middle of a line of page text are
    ignored. The file is read line by line rather than loaded whole.

    Args:
        ocr_path: Path of the OCR dump to inspect.

    Returns:
        The number of header lines, or 1 when there are none or the file
        cannot be read or decoded.
    """
    try:
        with open(ocr_path, encoding="utf-8") as ocr_f:
            pages = sum(1 for line in ocr_f if line.startswith("--- 第"))
    except (OSError, ValueError):
        # ValueError covers UnicodeDecodeError for non-UTF-8 content.
        return 1
    return pages or 1
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def ask_question_impl(question: str) -> Dict[str, Any]:
    """Answer one question through the RAG pipeline and record the turn.

    Args:
        question: The user's question text.

    Returns:
        A dict with ``"answer"`` and ``"sources"``; each source carries
        ``rank``, ``document``, ``page``, ``content_type`` and the first
        200 characters of ``content``.

    Raises:
        RuntimeError: If no pipeline exists yet or it reports not ready.
    """
    global _pipeline, _chat_history

    pipeline_ready = _pipeline is not None and _pipeline.is_ready
    if not pipeline_ready:
        raise RuntimeError("请先上传并处理文件")

    outcome = _pipeline.ask_with_history(question, _chat_history)

    _chat_history.append({"role": "user", "content": question})
    answer_text = outcome["answer"]
    _chat_history.append({"role": "assistant", "content": answer_text})

    # Cap the history at 20 question/answer pairs (40 messages) so it
    # cannot grow without bound.
    max_messages = 40
    if len(_chat_history) > max_messages:
        _chat_history = _chat_history[-max_messages:]

    cited = [
        {
            "rank": hit["rank"],
            "document": hit["document"],
            "page": hit["page"],
            "content_type": hit.get("content_type", ""),
            "content": hit["content"][:200],
        }
        for hit in outcome.get("sources", [])
    ]

    return {"answer": answer_text, "sources": cited}
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def clear_chat_impl():
    """Forget the conversation by rebinding the module-level history to a new empty list."""
    global _chat_history
    _chat_history = list()
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def get_system_status_impl() -> Dict[str, Any]:
    """Build a snapshot of the current configuration and processing state.

    Returns:
        A dict describing the embedding, LLM and OCR endpoints (API keys
        masked), the vector-store type, chunking/retrieval parameters, the
        indexed document count and the list of processed files.
    """

    def _mask_key(key: str) -> str:
        """Hide a secret: empty for unset/placeholder keys, stars for short ones, else first/last 4 chars."""
        if not key or key == "not-needed":
            return ""
        if len(key) <= 8:
            return "*" * len(key)
        return key[:4] + "****" + key[-4:]

    status = {
        "embedding": {
            "model": config.EMBEDDING_MODEL_NAME,
            "api_base": config.EMBEDDING_API_BASE,
            "api_key": _mask_key(config.EMBEDDING_API_KEY),
        },
        "llm": {
            "model": config.LLM_MODEL_NAME,
            "api_base": config.LLM_API_BASE,
            "api_key": _mask_key(config.LLM_API_KEY),
        },
        "ocr": {
            "engine": config.OCR_ENGINE,
            "model": config.OCR_API_MODEL,
            "api_base": config.OCR_API_BASE,
            "api_key": _mask_key(config.OCR_API_KEY),
        },
        "vector_store": config.VECTOR_STORE_TYPE,
        "params": {
            "chunk_size": config.CHUNK_SIZE,
            "chunk_overlap": config.CHUNK_OVERLAP,
            "retrieval_top_k": config.RETRIEVAL_TOP_K,
        },
        "document_count": 0,
        "files": _processed_files,
    }

    if _pipeline is not None:
        try:
            status["document_count"] = _pipeline.stats.get("document_count", 0)
        except Exception as exc:
            # Still best-effort (the count stays 0), but leave a trace
            # instead of hiding the failure completely.
            logger.warning(f"获取文档统计失败: {exc}")

    return status
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
def preload_ocr_engine():
    """Warm up the OCR engine at startup so the first upload does not wait for it.

    With ``OCR_ENGINE == "api"`` only an informational line is logged.
    With ``"paddle"`` the pipeline getter from ``ocr_loader`` is called
    once; any failure is logged as a warning and otherwise ignored.
    Other engine values do nothing.
    """
    engine = config.OCR_ENGINE

    if engine == "api":
        logger.info(f"OCR API 模式, 跳过预热 (endpoint: {config.OCR_API_BASE})")
        return

    if engine != "paddle":
        return

    try:
        logger.info("预热 PaddleOCR-VL 引擎...")
        from ocr_loader import _get_ocr_vl_pipeline

        _get_ocr_vl_pipeline()
        logger.info("OCR 引擎预热完成 ✓")
    except Exception as warmup_error:
        logger.warning(f"OCR 引擎预热跳过: {warmup_error}")
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
# ============================================================
|
| 327 |
+
# FastAPI App
|
| 328 |
+
# ============================================================
|
| 329 |
+
|
| 330 |
+
app = FastAPI(version="2.0", title="PDF OCR 智能问答系统")

# Frontend assets are served from a "static" directory next to this module;
# it is created on import so the mount below always has a target.
STATIC_DIR = Path(__file__).resolve().parent.joinpath("static")
STATIC_DIR.mkdir(exist_ok=True)
app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
class ChatRequest(BaseModel):
    # Request body of POST /api/chat.
    # question: the text forwarded to ask_question_impl().
    question: str
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
class ChatResponse(BaseModel):
    # Response body of POST /api/chat.
    # answer: the generated reply (or an error message on failure).
    answer: str
    # sources: one dict per cited chunk; empty when the request failed.
    sources: List[Dict[str, Any]]
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
# ── Routes ──
|
| 348 |
+
|
| 349 |
+
|
| 350 |
+
@app.get("/", response_class=HTMLResponse)
async def index():
    """Return ``static/index.html``, or a small 404 page when it is missing."""
    landing_page = STATIC_DIR / "index.html"
    if not landing_page.exists():
        return HTMLResponse("<h1>Frontend not found</h1>", status_code=404)
    return FileResponse(landing_page)
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
@app.post("/api/upload")
async def upload_files(
    files: List[UploadFile] = File(...),
    chunk_size: int = Form(800),
    chunk_overlap: int = Form(150),
):
    """Store each uploaded document and run it through the processing pipeline.

    Args:
        files: The uploaded files; entries without a filename are skipped.
        chunk_size: Chunk size passed on to ``process_file_impl``.
        chunk_overlap: Chunk overlap passed on to ``process_file_impl``.

    Returns:
        A dict with ``success``, per-file ``results``, the collected
        ``errors`` and ``total`` (number of files processed successfully).

    Raises:
        HTTPException: 400 when no named file was sent; 500 when every
            file failed.
    """
    if not files or all(not f.filename for f in files):
        raise HTTPException(400, "No files provided")

    upload_dir = config.UPLOAD_DIR
    upload_dir.mkdir(parents=True, exist_ok=True)

    results = []
    all_errors = []

    for file in files:
        if not file.filename:
            continue

        # `filename` is chosen by the client. Keep only its last path
        # component (treating "\" as a separator too) so a name such as
        # "../../app.py" cannot write outside the upload directory.
        safe_name = Path(file.filename.replace("\\", "/")).name
        if safe_name in ("", ".", ".."):
            all_errors.append(f"{file.filename}: invalid file name")
            continue

        tmp_path = upload_dir / safe_name
        try:
            with open(tmp_path, "wb") as f:
                shutil.copyfileobj(file.file, f)

            file_info, preview = process_file_impl(tmp_path, chunk_size, chunk_overlap)

            results.append({
                "success": True,
                "name": file_info["name"],
                "format": file_info["format"],
                "pages": file_info["pages"],
                "chunks": file_info["chunks"],
                "size_mb": file_info["size_mb"],
                "time": file_info["time"],
                "preview": preview,
                "message": "处理完成",
            })
        except ValueError as e:
            # ValueError is reported to the client without a traceback.
            all_errors.append(f"{file.filename}: {e}")
        except Exception as e:
            # Unexpected failure: log it with the traceback through the
            # logger so it reaches the log sinks, then continue with the
            # remaining files.
            logger.exception(f"处理失败 {file.filename}: {e}")
            all_errors.append(f"{file.filename}: {e}")

    if not results and all_errors:
        raise HTTPException(500, "; ".join(all_errors))

    _save_files_to_disk()

    return {
        "success": True,
        "results": results,
        "errors": all_errors,
        "total": len(results),
    }
|
| 416 |
+
|
| 417 |
+
|
| 418 |
+
@app.delete("/api/files/{index}")
async def delete_file(index: int):
    """Drop one entry from the processed-file list and delete its saved OCR text.

    Raises:
        HTTPException: 404 when ``index`` is outside the list.
    """
    global _processed_files

    if not (0 <= index < len(_processed_files)):
        raise HTTPException(404, "File index not found")

    entry = _processed_files.pop(index)
    removed_name = entry["name"]
    _delete_ocr_text(removed_name)
    _save_files_to_disk()
    logger.info(f"已移除文件: {removed_name}")
    return {"success": True, "removed": removed_name}
|
| 429 |
+
|
| 430 |
+
|
| 431 |
+
@app.get("/api/preview/{index}")
async def get_preview(index: int):
    """Return the stored OCR text of the processed file at ``index``.

    Raises:
        HTTPException: 404 when ``index`` is outside the list.
    """
    if index < 0 or index >= len(_processed_files):
        raise HTTPException(404, "File index out of range")

    filename = _processed_files[index]["name"]
    ocr_text = _load_ocr_text(filename)
    if not ocr_text:
        return {"success": False, "text": "", "message": "OCR text file not found on disk"}
    return {"success": True, "text": ocr_text, "index": index, "filename": filename}
|
| 441 |
+
|
| 442 |
+
|
| 443 |
+
@app.get("/api/file/{index}")
async def get_original_file(index: int):
    """Send back the originally uploaded file for the entry at ``index``.

    The path recorded with the entry is tried first; if it is absent or no
    longer exists, the upload directory is searched by file name.

    Raises:
        HTTPException: 404 when ``index`` is out of range or neither
            location holds the file.
    """
    total = len(_processed_files)
    if not (0 <= index < total):
        raise HTTPException(404, f"File index {index} out of range (total: {total})")

    record = _processed_files[index]
    filename = record["name"]

    # 1) the path stored at processing time
    stored_path = record.get("path", "")
    if stored_path and Path(stored_path).exists():
        return FileResponse(stored_path)

    # 2) fallback: same name inside the upload directory
    by_name = config.UPLOAD_DIR / filename
    if by_name.exists():
        return FileResponse(str(by_name))

    raise HTTPException(404, f"Original file not found: {filename}")
|
| 458 |
+
|
| 459 |
+
|
| 460 |
+
@app.post("/api/chat", response_model=ChatResponse)
async def chat(req: ChatRequest):
    """Answer a question about the processed documents.

    Failures are not raised to the client: the error text is returned as
    the ``answer`` with an empty ``sources`` list.
    """
    try:
        return ChatResponse(**ask_question_impl(req.question))
    except RuntimeError as not_ready:
        reply = str(not_ready)
    except Exception as failure:
        logger.error(f"问答失败: {failure}")
        import traceback

        traceback.print_exc()
        reply = f"问答失败: {str(failure)}"
    return {"answer": reply, "sources": []}
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
@app.delete("/api/chat")
async def clear_chat():
    """Reset the conversation history and acknowledge."""
    clear_chat_impl()
    acknowledgement = {"success": True}
    return acknowledgement
|
| 480 |
+
|
| 481 |
+
|
| 482 |
+
@app.get("/api/status")
async def get_status():
    """Expose the snapshot produced by ``get_system_status_impl``."""
    snapshot = get_system_status_impl()
    return snapshot
|
| 486 |
+
|
| 487 |
+
|
| 488 |
+
# ── Config API ──
|
| 489 |
+
|
| 490 |
+
# Whitelist of setting names that POST /api/config is allowed to change.
CONFIG_KEYS = {
    # embedding endpoint
    "EMBEDDING_API_BASE",
    "EMBEDDING_MODEL_NAME",
    "EMBEDDING_API_KEY",
    # LLM endpoint
    "LLM_API_BASE",
    "LLM_MODEL_NAME",
    "LLM_API_KEY",
    # OCR
    "OCR_API_BASE",
    "OCR_API_MODEL",
    "OCR_API_KEY",
    "OCR_ENGINE",
    # chunking / retrieval
    "CHUNK_SIZE",
    "CHUNK_OVERLAP",
    "RETRIEVAL_TOP_K",
}
|
| 496 |
+
|
| 497 |
+
|
| 498 |
+
def _update_env_file(updates: Dict[str, str]):
|
| 499 |
+
"""将配置变更写入 .env 文件"""
|
| 500 |
+
env_path = config.BASE_DIR / ".env"
|
| 501 |
+
if env_path.exists():
|
| 502 |
+
lines = env_path.read_text(encoding="utf-8").splitlines()
|
| 503 |
+
else:
|
| 504 |
+
lines = []
|
| 505 |
+
|
| 506 |
+
updated_keys = set()
|
| 507 |
+
new_lines = []
|
| 508 |
+
for line in lines:
|
| 509 |
+
stripped = line.strip()
|
| 510 |
+
if stripped and not stripped.startswith("#") and "=" in stripped:
|
| 511 |
+
key = stripped.split("=", 1)[0].strip()
|
| 512 |
+
if key in updates:
|
| 513 |
+
new_lines.append(f"{key}={updates[key]}")
|
| 514 |
+
updated_keys.add(key)
|
| 515 |
+
continue
|
| 516 |
+
new_lines.append(line)
|
| 517 |
+
|
| 518 |
+
for k, v in updates.items():
|
| 519 |
+
if k not in updated_keys:
|
| 520 |
+
new_lines.append(f"{k}={v}")
|
| 521 |
+
|
| 522 |
+
env_path.write_text("\n".join(new_lines) + "\n", encoding="utf-8")
|
| 523 |
+
|
| 524 |
+
|
| 525 |
+
@app.get("/api/config")
async def get_config():
    """Return the current API configuration grouped by component.

    NOTE(review): API keys are returned unmasked here, unlike
    ``/api/status`` — confirm this is what the frontend requires.
    """
    embedding_section = {
        "api_base": config.EMBEDDING_API_BASE,
        "model_name": config.EMBEDDING_MODEL_NAME,
        "api_key": config.EMBEDDING_API_KEY,
    }
    llm_section = {
        "api_base": config.LLM_API_BASE,
        "model_name": config.LLM_MODEL_NAME,
        "api_key": config.LLM_API_KEY,
    }
    ocr_section = {
        "engine": config.OCR_ENGINE,
        "api_base": config.OCR_API_BASE,
        "model_name": config.OCR_API_MODEL,
        "api_key": config.OCR_API_KEY,
    }
    retrieval_section = {
        "chunk_size": config.CHUNK_SIZE,
        "chunk_overlap": config.CHUNK_OVERLAP,
        "top_k": config.RETRIEVAL_TOP_K,
    }
    return {
        "embedding": embedding_section,
        "llm": llm_section,
        "ocr": ocr_section,
        "retrieval": retrieval_section,
    }
|
| 551 |
+
|
| 552 |
+
|
| 553 |
+
@app.post("/api/config")
async def update_config(updates: Dict[str, str]):
    """Apply configuration changes: persist them to ``.env`` and reload ``config``.

    Only keys listed in ``CONFIG_KEYS`` are accepted; others are ignored.
    All accepted values are validated before anything is changed, so a
    rejected request leaves the environment and ``.env`` untouched.

    Args:
        updates: Mapping of setting name to new value.

    Returns:
        ``{"success": True, "updated": [...]}`` listing the applied keys.

    Raises:
        HTTPException: 400 when a value spans several lines or an integer
            setting is not parseable as an integer.
    """
    import os as _os

    # These settings are parsed with int() when config is reloaded; a bad
    # value would make the reload (and the next startup) fail.
    integer_keys = {"CHUNK_SIZE", "CHUNK_OVERLAP", "RETRIEVAL_TOP_K"}

    applied = {}
    for key in updates:
        if key not in CONFIG_KEYS:
            continue
        value = str(updates[key])
        if "\n" in value or "\r" in value:
            raise HTTPException(400, f"{key}: value must be a single line")
        if key in integer_keys:
            try:
                int(value)
            except ValueError:
                raise HTTPException(
                    400, f"{key}: expected an integer, got {value!r}"
                ) from None
        applied[key] = value

    # Everything validated — now apply.
    for key, value in applied.items():
        _os.environ[key] = value

    if applied:
        _update_env_file(applied)

    # Reload the config module so the new values take effect.
    import importlib
    importlib.reload(config)

    # Reset the cached embedding client so it is rebuilt with the new settings.
    from embeddings import reset_embedding_model
    reset_embedding_model()

    logger.info(f"配置已更新: {list(applied.keys())}")

    return {"success": True, "updated": list(applied.keys())}
|
| 578 |
+
|
| 579 |
+
|
| 580 |
+
# ============================================================
|
| 581 |
+
# Main
|
| 582 |
+
# ============================================================
|
| 583 |
+
|
| 584 |
+
def main():
    """Configure logging, restore state, warm up OCR and start the web server.

    Logging goes to a dated file under ``config.LOG_DIR`` and to the
    console. The server listens on 0.0.0.0:7860 without auto-reload.
    """
    import uvicorn

    # Replace the default sink with a rotating file sink and a console sink.
    logger.remove()
    file_sink = config.LOG_DIR / "app_{time:YYYY-MM-DD}.log"
    logger.add(
        file_sink,
        level=config.LOG_LEVEL,
        format=config.LOG_FORMAT,
        rotation="100 MB",
        retention="30 days",
        encoding="utf-8",
    )
    console_format = "<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>"
    logger.add(
        lambda msg: print(msg, end=""),
        level="INFO",
        format=console_format,
        colorize=True,
    )

    # Startup banner.
    rule = "=" * 50
    ocr_target = config.OCR_API_BASE if config.OCR_ENGINE == "api" else "local"
    banner = (
        rule,
        " PDF OCR 智能问答系统 启动中...",
        rule,
        f" OCR: PaddleOCR-VL-1.5 ({config.OCR_VL_BACKEND})",
        f" 嵌入: {config.EMBEDDING_MODEL_NAME} (API: {config.EMBEDDING_API_BASE})",
        f" LLM: {config.LLM_MODEL_NAME} (API: {config.LLM_API_BASE})",
        f" OCR: {config.OCR_ENGINE} ({ocr_target})",
        f" 向量数据库: {config.VECTOR_STORE_TYPE}",
        f" 支持格式: {sorted(config.SUPPORTED_FORMATS)}",
    )
    for banner_line in banner:
        logger.info(banner_line)

    # Restore the processed-file list from disk.
    _load_files_from_disk()

    # Warm up the OCR engine.
    preload_ocr_engine()

    server_options = {
        "host": "0.0.0.0",
        "port": 7860,
        "reload": False,
        "log_level": "info",
    }
    uvicorn.run(app, **server_options)


if __name__ == "__main__":
    main()
|
assets/OCR_RAG.mp4
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c39f595d1822c620fde84ef317c2ea95b24edb8568214e06aec97e3f5251bed
|
| 3 |
+
size 2590191
|
assets/image-1.png
ADDED
|
Git LFS Details
|
assets/image-12.png
ADDED
|
assets/image-13.png
ADDED
|
Git LFS Details
|
assets/image-14.png
ADDED
|
Git LFS Details
|
assets/image-15.png
ADDED
|
assets/image-16.png
ADDED
|
Git LFS Details
|
assets/image-2.png
ADDED
|
assets/image-3.png
ADDED
|
assets/image-4.png
ADDED
|
Git LFS Details
|
assets/image-5.png
ADDED
|
Git LFS Details
|
assets/image-7.png
ADDED
|
Git LFS Details
|
assets/image.png
ADDED
|
Git LFS Details
|
config.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
============================================================
|
| 3 |
+
OCR RAG 智能问答系统 - 全局配置
|
| 4 |
+
============================================================
|
| 5 |
+
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from dotenv import load_dotenv
|
| 11 |
+
|
| 12 |
+
load_dotenv()
|
| 13 |
+
|
| 14 |
+
# ---- 项目路径 ----
|
| 15 |
+
# All runtime data lives under <project>/data.
BASE_DIR = Path(__file__).resolve().parent
DATA_DIR = BASE_DIR.joinpath("data")
UPLOAD_DIR = DATA_DIR.joinpath("uploads")
OCR_OUTPUT_DIR = DATA_DIR.joinpath("ocr_output")
VECTOR_DB_DIR = DATA_DIR.joinpath("vector_db")
LOG_DIR = DATA_DIR.joinpath("logs")

# Create the directory tree at import time so callers can rely on it.
for d in (DATA_DIR, UPLOAD_DIR, OCR_OUTPUT_DIR, VECTOR_DB_DIR, LOG_DIR):
    d.mkdir(parents=True, exist_ok=True)
|
| 24 |
+
|
| 25 |
+
# ============================================================
|
| 26 |
+
# PaddleOCR-VL-1.5 配置
|
| 27 |
+
# ============================================================
|
| 28 |
+
# PaddleOCR-VL-1.5: 0.9B 视觉语言模型, OmniDocBench v1.5 94.5% 精度
|
| 29 |
+
# 支持: PDF / PNG / JPG / BMP / TIF
|
| 30 |
+
# OCR 引擎:
|
| 31 |
+
# paddle - PaddleOCR pipeline (默认, 版面分析 + 页面级解析, 推荐)
|
| 32 |
+
# transformers - transformers v5 原生推理 (元素级识别, 轻量)
|
| 33 |
+
# PaddleOCR 后端 (仅 engine=paddle 时生效):
|
| 34 |
+
# native - 本地 PaddlePaddle 推理
|
| 35 |
+
# vllm-server - vLLM 服务端 (高吞吐)
|
| 36 |
+
# llama-cpp-server - llama.cpp GGUF (边缘设备)
|
| 37 |
+
|
| 38 |
+
OCR_ENGINE = os.environ.get("OCR_ENGINE", "paddle")  # paddle / api

# OCR API settings (used with OCR_ENGINE=api; OpenAI-compatible endpoint).
# Example vLLM deployment:
#   python -m vllm.entrypoints.openai.api_server \
#       --model PaddlePaddle/PaddleOCR-VL-1.5 --trust-remote-code --port 8002
OCR_API_BASE = os.environ.get("OCR_API_BASE", "http://127.0.0.1:8002/v1")
OCR_API_KEY = os.environ.get("OCR_API_KEY", "not-needed")
OCR_API_MODEL = os.environ.get("OCR_API_MODEL", "PaddleOCR-VL-1.5")
OCR_TASK = os.environ.get("OCR_TASK", "ocr")  # ocr / table / chart / formula / spotting / seal

OCR_VL_BACKEND = os.environ.get("OCR_VL_BACKEND", "native")
OCR_VL_SERVER_URL = os.environ.get("OCR_VL_SERVER_URL", "http://127.0.0.1:8080/v1")

# Boolean switches are enabled only by the (case-insensitive) string "true".
OCR_USE_LAYOUT = os.environ.get("OCR_USE_LAYOUT", "true").lower() == "true"
OCR_LAYOUT_THRESHOLD = float(os.environ.get("OCR_LAYOUT_THRESHOLD", "0.5"))
OCR_USE_CHART = os.environ.get("OCR_USE_CHART", "false").lower() == "true"

OCR_MAX_NEW_TOKENS = int(os.environ.get("OCR_MAX_NEW_TOKENS", "4096"))
OCR_TEMPERATURE = float(os.environ.get("OCR_TEMPERATURE", "0.0"))

PDF_RENDER_DPI = int(os.environ.get("PDF_RENDER_DPI", "300"))

SUPPORTED_IMAGE_FORMATS = {".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"}
SUPPORTED_FORMATS = SUPPORTED_IMAGE_FORMATS.union({".pdf"})

MAX_FILE_SIZE_MB = int(os.environ.get("MAX_FILE_SIZE_MB", "50"))
|
| 65 |
+
|
| 66 |
+
# ============================================================
|
| 67 |
+
# 文本分割
|
| 68 |
+
# ============================================================
|
| 69 |
+
# Target chunk length and the overlap between neighbouring chunks.
CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "800"))
CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "150"))

# Split points, from coarsest to finest: paragraph, line, CJK sentence
# punctuation, ASCII sentence punctuation, space, and finally any position.
# The full-width marks are written as escapes so they cannot be confused
# with (or normalised into) their ASCII look-alikes, which previously left
# "!", "?" and ";" listed twice and the full-width forms missing.
SEPARATORS = [
    "\n\n",
    "\n",
    "。",
    "\uff01",  # full-width exclamation mark
    "\uff1f",  # full-width question mark
    "\uff1b",  # full-width semicolon
    ".",
    "!",
    "?",
    ";",
    " ",
    "",
]
|
| 72 |
+
|
| 73 |
+
# ============================================================
|
| 74 |
+
# Embedding API 配置 (OpenAI 兼容格式)
|
| 75 |
+
# ============================================================
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# Embedding service (OpenAI-compatible API); every value can be overridden
# through an environment variable of the same name.
EMBEDDING_MODEL_NAME = os.environ.get("EMBEDDING_MODEL_NAME", "Qwen/Qwen3-Embedding-0.6B")
EMBEDDING_API_BASE = os.environ.get("EMBEDDING_API_BASE", "http://127.0.0.1:8001/v1")
EMBEDDING_API_KEY = os.environ.get("EMBEDDING_API_KEY", "not-needed")
# Number of texts sent per embedding request.
EMBEDDING_BATCH_SIZE = int(os.environ.get("EMBEDDING_BATCH_SIZE", "10"))
|
| 86 |
+
|
| 87 |
+
# ============================================================
|
| 88 |
+
# 向量数据库
|
| 89 |
+
# ============================================================
|
| 90 |
+
# Vector store backend, its collection name, and how many chunks to retrieve.
VECTOR_STORE_TYPE = os.environ.get("VECTOR_STORE_TYPE", "chroma")
CHROMA_COLLECTION_NAME = os.environ.get("CHROMA_COLLECTION_NAME", "pdf_ocr_knowledge")
RETRIEVAL_TOP_K = int(os.environ.get("RETRIEVAL_TOP_K", "3"))
|
| 93 |
+
|
| 94 |
+
# ============================================================
|
| 95 |
+
# LLM API 配置 (OpenAI 兼容格式)
|
| 96 |
+
# ============================================================
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# Chat model endpoint (OpenAI-compatible API) and generation settings.
LLM_API_KEY = os.environ.get("LLM_API_KEY", "not-needed")
LLM_API_BASE = os.environ.get("LLM_API_BASE", "http://127.0.0.1:8000/v1")
LLM_MODEL_NAME = os.environ.get("LLM_MODEL_NAME", "Qwen/Qwen3-8B")
LLM_TEMPERATURE = float(os.environ.get("LLM_TEMPERATURE", "0.1"))
LLM_MAX_TOKENS = int(os.environ.get("LLM_MAX_TOKENS", "512"))
|
| 104 |
+
|
| 105 |
+
# ============================================================
|
| 106 |
+
# 系统 Prompt
|
| 107 |
+
# ============================================================
|
| 108 |
+
# Instruction placed at the top of every RAG prompt.
SYSTEM_PROMPT = "根据以下文档内容,简洁回答用户问题。只依据文档内容回答,不要编造。使用中文。"

# Prompt layout; placeholders: {system_prompt}, {context}, {question}.
RAG_PROMPT_TEMPLATE = (
    "{system_prompt}\n"
    "\n"
    "## 参考文档内容:\n"
    "{context}\n"
    "\n"
    "## 用户问题:\n"
    "{question}\n"
    "\n"
    "## 回答:"
)
|
| 119 |
+
|
| 120 |
+
# ============================================================
|
| 121 |
+
# 日志
|
| 122 |
+
# ============================================================
|
| 123 |
+
# Minimum level and line layout used for the log file sink.
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
LOG_FORMAT = (
    "{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | "
    "{name}:{function}:{line} - {message}"
)
|
embeddings.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
============================================================
|
| 3 |
+
向量嵌入模块 (OpenAI 兼容 API)
|
| 4 |
+
============================================================
|
| 5 |
+
直接使用 openai 客户端, 兼容:
|
| 6 |
+
- 阿里云 DashScope (text-embedding-v4 等)
|
| 7 |
+
- vLLM 部署的 Qwen3-Embedding
|
| 8 |
+
- 任意 OpenAI 兼容嵌入服务
|
| 9 |
+
|
| 10 |
+
用法:
|
| 11 |
+
model = get_embedding_model()
|
| 12 |
+
vec = model.embed_query("查询文本")
|
| 13 |
+
vecs = model.embed_documents(["文本1", "文本2"])
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from typing import List, Optional
|
| 17 |
+
import numpy as np
|
| 18 |
+
|
| 19 |
+
from langchain_core.embeddings import Embeddings
|
| 20 |
+
from openai import OpenAI
|
| 21 |
+
|
| 22 |
+
from loguru import logger
|
| 23 |
+
|
| 24 |
+
import config
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# ============================================================
|
| 28 |
+
# 通用 OpenAI 兼容嵌入类
|
| 29 |
+
# ============================================================
|
| 30 |
+
|
| 31 |
+
class OpenAICompatEmbeddings(Embeddings):
    """Thin embeddings adapter that talks to an OpenAI-compatible API.

    Requests are sent with the ``openai`` client directly instead of going
    through ``langchain_openai``, whose extra wrapping can cause
    compatibility problems with some services (for example differences in
    DashScope's parameter validation).
    """

    def __init__(
        self,
        model: Optional[str] = None,
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        batch_size: Optional[int] = None,
        dimensions: Optional[int] = None,
    ):
        """Create the client; unset arguments fall back to values from ``config``.

        Args:
            model: Embedding model name.
            api_key: API key for the service.
            base_url: Base URL of the service.
            batch_size: Texts per request in ``embed_documents``.
            dimensions: Optional output dimension, sent only when truthy.
        """
        resolved_base = base_url or config.EMBEDDING_API_BASE

        self.model = model or config.EMBEDDING_MODEL_NAME
        self.batch_size = config.EMBEDDING_BATCH_SIZE if batch_size is None else batch_size
        self.dimensions = dimensions

        self._client = OpenAI(
            api_key=api_key or config.EMBEDDING_API_KEY,
            base_url=resolved_base,
        )

        logger.info(
            f"Embedding API 连接: model={self.model}, "
            f"base_url={resolved_base}"
        )

    def _build_request(self, payload):
        """Assemble keyword arguments for one ``embeddings.create`` call."""
        request = {"model": self.model, "input": payload}
        if self.dimensions:
            request["dimensions"] = self.dimensions
        return request

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed many texts, ``batch_size`` at a time, preserving input order."""
        if not texts:
            return []

        total = len(texts)
        vectors = []

        for start in range(0, total, self.batch_size):
            stop = start + self.batch_size
            reply = self._client.embeddings.create(**self._build_request(texts[start:stop]))
            # The results are taken in the order the response lists them.
            vectors.extend(item.embedding for item in reply.data)

            if total > self.batch_size:
                logger.debug(
                    f"嵌入进度: {min(stop, total)}/{total}"
                )

        return vectors

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string and return its vector."""
        reply = self._client.embeddings.create(**self._build_request(text))
        return reply.data[0].embedding
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# ============================================================
|
| 97 |
+
# 全局单例
|
| 98 |
+
# ============================================================
|
| 99 |
+
|
| 100 |
+
# Process-wide cached embeddings client; None until first requested.
_embedding_model: Optional[Embeddings] = None


def get_embedding_model(
    model_name: Optional[str] = None,
    api_base: Optional[str] = None,
) -> Embeddings:
    """Return the shared embeddings client, creating it on first use.

    Args:
        model_name: Model name used when the client is created.
        api_base: Base URL used when the client is created.

    Returns:
        The cached client. Once it exists, the arguments are ignored until
        ``reset_embedding_model`` clears the cache.
    """
    global _embedding_model
    if _embedding_model is not None:
        return _embedding_model

    _embedding_model = OpenAICompatEmbeddings(model=model_name, base_url=api_base)
    return _embedding_model
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def reset_embedding_model():
    """Discard the cached embeddings client so the next request builds a new one."""
    global _embedding_model

    _embedding_model = None
    logger.info("嵌入模型已重置")
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
# ============================================================
|
| 125 |
+
# 工具函数
|
| 126 |
+
# ============================================================
|
| 127 |
+
|
| 128 |
+
def compute_similarity(vec1: List[float], vec2: List[float]) -> float:
    """Return the cosine similarity of two vectors.

    Args:
        vec1: First vector.
        vec2: Second vector.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)`` as a Python float, or
        ``0.0`` when the product of the norms is zero.
    """
    left = np.array(vec1)
    right = np.array(vec2)
    norm_product = np.linalg.norm(left) * np.linalg.norm(right)
    if norm_product == 0:
        # Avoid dividing by zero for an all-zero vector.
        return 0.0
    cosine = np.dot(left, right) / norm_product
    return float(cosine)
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def batch_embed(
    texts: List[str],
    model: Optional[Embeddings] = None,
    batch_size: Optional[int] = None,
    show_progress: bool = False,
) -> List[List[float]]:
    """Embed ``texts`` in slices through ``model.embed_documents``.

    Args:
        texts: Texts to embed.
        model: Embeddings object; the shared singleton is used when None.
        batch_size: Slice length; a falsy value falls back to
            ``config.EMBEDDING_BATCH_SIZE``.
        show_progress: Log progress after each slice except the last.

    Returns:
        One vector per input text, in input order.
    """
    if model is None:
        model = get_embedding_model()

    total = len(texts)
    step = batch_size if batch_size else config.EMBEDDING_BATCH_SIZE

    vectors = []
    for start in range(0, total, step):
        stop = start + step
        vectors.extend(model.embed_documents(texts[start:stop]))

        if show_progress and stop < total:
            logger.debug(f"嵌入进度: {min(stop, total)}/{total}")

    return vectors
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
# ============================================================
|
| 163 |
+
# 测试入口
|
| 164 |
+
# ============================================================
|
| 165 |
+
|
| 166 |
+
if __name__ == "__main__":
    # Manual smoke test: embed a query and a few documents against the
    # configured endpoint and print dimensions and similarities.
    print("测试 Embedding API 连接...\n")
    print(f"API: {config.EMBEDDING_API_BASE}")
    print(f"模型: {config.EMBEDDING_MODEL_NAME}")

    try:
        embedder = get_embedding_model()

        sample_docs = [
            "这是第一段测试文本,用于验证嵌入API是否正常工作。",
            "这是第二段完全不同的文本内容,涉及人工智能话题。",
            "向量嵌入是自然语言处理中的基础技术。",
        ]

        print("\n测试单文本嵌入 (embed_query)...")
        query_vector = embedder.embed_query("嵌入模型测试")
        print(f" 维度: {len(query_vector)}")

        print("\n测试批量嵌入 (embed_documents)...")
        doc_vectors = embedder.embed_documents(sample_docs)
        print(f" 数量: {len(doc_vectors)}, 维度: {len(doc_vectors[0])}")

        print("\n测试相似度计算...")
        related_score = compute_similarity(doc_vectors[2], query_vector)
        unrelated_score = compute_similarity(doc_vectors[0], query_vector)
        print(f" 查询 vs 向量嵌入文本: {related_score:.4f}")
        print(f" 查询 vs 无关文本: {unrelated_score:.4f}")

        print("\n✓ Embedding API 测试通过")

    except Exception as failure:
        print(f"\n✗ API 连接失败: {failure}")
        print(f" 请确保 Embedding API 服务已启动: {config.EMBEDDING_API_BASE}")
|
ocr_loader.py
ADDED
|
@@ -0,0 +1,829 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
============================================================
|
| 3 |
+
PaddleOCR-VL-1.5 文档加载器
|
| 4 |
+
============================================================
|
| 5 |
+
模型: PaddleOCR-VL-1.5 (0.9B 视觉语言模型, OmniDocBench v1.5 94.5% 精度)
|
| 6 |
+
支持格式: PDF / PNG / JPG / JPEG / BMP / TIF / TIFF
|
| 7 |
+
|
| 8 |
+
功能:
|
| 9 |
+
1. 文档 (PDF/图片) → PaddleOCR-VL-1.5 端到端识别
|
| 10 |
+
2. 输出 Markdown/JSON 结构化结果 (含版面/表格/公式/印章)
|
| 11 |
+
3. 转换为 LangChain Document 对象
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import gc
|
| 15 |
+
import time
|
| 16 |
+
import warnings
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
from typing import List, Optional, Iterator, Dict, Any, Union
|
| 19 |
+
from dataclasses import dataclass, field
|
| 20 |
+
|
| 21 |
+
import fitz # PyMuPDF: PDF 页面渲染和元数据提取
|
| 22 |
+
import numpy as np
|
| 23 |
+
from PIL import Image
|
| 24 |
+
|
| 25 |
+
from langchain_core.documents import Document
|
| 26 |
+
|
| 27 |
+
from loguru import logger
|
| 28 |
+
|
| 29 |
+
import config
|
| 30 |
+
|
| 31 |
+
warnings.filterwarnings("ignore")
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# ============================================================
|
| 35 |
+
# PaddleOCR-VL-1.5 全局单例
|
| 36 |
+
# ============================================================
|
| 37 |
+
|
| 38 |
+
_ocr_vl_pipeline = None
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def _get_ocr_vl_pipeline():
    """Return the shared PaddleOCR-VL pipeline, creating it on first use.

    The instance is cached in the module-level ``_ocr_vl_pipeline`` so the
    model is only initialized once per process. ``paddleocr`` is imported
    lazily, i.e. only when the pipeline actually has to be built.

    Returns:
        The cached ``PaddleOCRVL`` instance.
    """
    global _ocr_vl_pipeline

    # Fast path: already initialized.
    if _ocr_vl_pipeline is not None:
        return _ocr_vl_pipeline

    from paddleocr import PaddleOCRVL

    backend = config.OCR_VL_BACKEND
    logger.info(f"正在初始化 PaddleOCR-VL-1.5 模型 (backend={backend})...")

    options = {
        "use_layout_detection": config.OCR_USE_LAYOUT,
        "use_chart_recognition": config.OCR_USE_CHART,
        "merge_layout_blocks": True,
        "layout_threshold": config.OCR_LAYOUT_THRESHOLD,
    }

    # Both remote backends take the same two extra options; any other
    # backend value leaves the constructor defaults untouched.
    if backend in ("vllm-server", "llama-cpp-server"):
        options["vl_rec_backend"] = backend
        options["vl_rec_server_url"] = config.OCR_VL_SERVER_URL

    _ocr_vl_pipeline = PaddleOCRVL(**options)
    logger.info("PaddleOCR-VL-1.5 模型初始化完成 ✓")
    return _ocr_vl_pipeline
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# ============================================================
|
| 71 |
+
# 数据结构
|
| 72 |
+
# ============================================================
|
| 73 |
+
|
| 74 |
+
@dataclass
class OCRResult:
    """OCR output for a single page or a single image."""

    # Page number; the extractors in this module assign 1-based values.
    page_num: int = 0
    # Recognized text (Markdown when the engine provides it).
    markdown_text: str = ""
    # Raw structured payload returned by the engine, if any.
    json_data: Optional[Dict[str, Any]] = None
    # Parsed blocks as {"type", "text", "bbox"} dicts.
    text_blocks: List[Dict[str, Any]] = field(default_factory=list)
    # Parsed tables as {"text", "html", "markdown", "bbox"} dicts.
    tables: List[Dict[str, Any]] = field(default_factory=list)
    # Parsed formulas as {"latex", "text", "bbox"} dicts.
    formulas: List[Dict[str, Any]] = field(default_factory=list)
    # Not populated by the code visible in this module.
    images_in_page: List[Dict[str, Any]] = field(default_factory=list)
    # Not populated by the code visible in this module.
    layout_regions: List[Dict[str, Any]] = field(default_factory=list)
    # Time attributed to this page, in milliseconds.
    ocr_time_ms: float = 0.0
    # Origin format label, e.g. "pdf", "png", "jpg".
    source_format: str = ""
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# ============================================================
|
| 90 |
+
# PaddleOCR-VL-1.5 文本提取器
|
| 91 |
+
# ============================================================
|
| 92 |
+
|
| 93 |
+
class VLOCRExtractor:
    """Extract structured content from documents with the PaddleOCR-VL pipeline.

    All methods are static; the underlying pipeline is obtained from
    ``_get_ocr_vl_pipeline()`` on each call to :meth:`extract`.
    """

    @staticmethod
    def extract(image_or_path: Union[str, Path, np.ndarray]) -> List["OCRResult"]:
        """Run OCR on one image or one PDF.

        Args:
            image_or_path: Image path, PDF path, or an image as a numpy array.

        Returns:
            One ``OCRResult`` per item produced by the pipeline (several for a
            multi-page PDF, one for a single image).
        """
        pipeline = _get_ocr_vl_pipeline()
        start_time = time.time()

        logger.info("PaddleOCR-VL 正在推理中 (首次调用较慢, CPU 约 30-60s/页) ...")
        # Materialize the output so the page count is known even if
        # predict() hands back a lazy iterable.
        raw_output = list(pipeline.predict(image_or_path))
        elapsed_s = time.time() - start_time
        logger.info(f"推理完成, 耗时 {elapsed_s:.1f}s")

        # Split the inference time evenly across pages. Computed once so the
        # value does not grow with the parsing work done in the loop below.
        per_page_ms = elapsed_s * 1000 / len(raw_output) if raw_output else 0.0

        results = []
        for index, res in enumerate(raw_output, start=1):
            page_result = OCRResult(page_num=index, ocr_time_ms=per_page_ms)

            # Structured JSON is optional: a failure here must not lose the page.
            try:
                json_data = res.json
                if json_data:
                    page_result.json_data = json_data
                    page_result.text_blocks = VLOCRExtractor._parse_text_blocks(json_data)
                    page_result.tables = VLOCRExtractor._parse_tables(json_data)
                    page_result.formulas = VLOCRExtractor._parse_formulas(json_data)
            except Exception as e:
                logger.debug(f"JSON 解析跳过: {e}")

            # Markdown text is best-effort too, but the failure is now logged.
            try:
                page_result.markdown_text = VLOCRExtractor._markdown_to_text(res.markdown)
            except Exception as e:
                logger.debug(f"Markdown 提取跳过: {e}")
                page_result.markdown_text = ""

            # Fallback: rebuild plain text from the JSON blocks.
            if not page_result.markdown_text and page_result.json_data:
                page_result.markdown_text = VLOCRExtractor._build_text_from_blocks(
                    page_result.json_data
                )

            results.append(page_result)

        return results

    @staticmethod
    def extract_text(image_or_path: Union[str, Path, np.ndarray]) -> str:
        """Return the text of all pages joined by blank lines (empty pages skipped)."""
        results = VLOCRExtractor.extract(image_or_path)
        return "\n\n".join(r.markdown_text for r in results if r.markdown_text)

    @staticmethod
    def extract_to_markdown(image_or_path: Union[str, Path, np.ndarray]) -> str:
        """Return the recognized text; currently identical to :meth:`extract_text`."""
        return VLOCRExtractor.extract_text(image_or_path)

    @staticmethod
    def extract_to_json(
        image_or_path: Union[str, Path, np.ndarray],
        save_path: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Run OCR and return a JSON-style dict, optionally writing it to disk.

        Args:
            image_or_path: Image path, PDF path, or an image as a numpy array.
            save_path: When given, the result is also written there as UTF-8
                JSON; missing parent directories are created.

        Returns:
            ``{"pages": [...], "total_pages": int}`` where each page carries
            ``page_num``, ``markdown``, ``json``, ``tables`` and ``formulas``.
        """
        results = VLOCRExtractor.extract(image_or_path)
        output = {
            "pages": [
                {
                    "page_num": r.page_num,
                    "markdown": r.markdown_text,
                    "json": r.json_data,
                    "tables": r.tables,
                    "formulas": r.formulas,
                }
                for r in results
            ],
            "total_pages": len(results),
        }

        if save_path:
            import json

            target = Path(save_path)
            target.parent.mkdir(parents=True, exist_ok=True)
            with open(target, "w", encoding="utf-8") as f:
                json.dump(output, f, ensure_ascii=False, indent=2)
            logger.info(f"OCR 结果已保存: {target}")

        return output

    # ---- structured parsing helpers ----

    @staticmethod
    def _markdown_to_text(md: Any) -> str:
        """Normalize the pipeline's ``markdown`` attribute to a plain string.

        A dict is read under ``"markdown_texts"`` first and ``"text"`` second.
        NOTE(review): "markdown_texts" is the key PaddleOCR 3.x result objects
        are expected to use -- confirm against the installed version. The
        "text" key is kept for backward compatibility.
        """
        if isinstance(md, dict):
            value = md.get("markdown_texts") or md.get("text") or ""
            return value if isinstance(value, str) else str(value)
        if isinstance(md, str):
            return md
        return str(md) if md else ""

    @staticmethod
    def _get_parsing_list(json_data: Dict) -> List[Dict]:
        """Return the ``parsing_res_list`` entries of a result payload.

        The list is looked up under ``json_data["res"]`` when that key exists,
        otherwise directly on ``json_data``. Missing, ``None`` or wrongly typed
        containers yield an empty list, and non-dict entries are dropped, so
        callers can iterate and use ``.get`` safely.
        """
        if not isinstance(json_data, dict):
            return []
        res = json_data.get("res", json_data)
        if not isinstance(res, dict):
            return []
        parsing = res.get("parsing_res_list")
        if not isinstance(parsing, (list, tuple)):
            return []
        return [item for item in parsing if isinstance(item, dict)]

    @staticmethod
    def _parse_text_blocks(json_data: Dict) -> List[Dict[str, Any]]:
        """Collect every non-empty block except those labelled ``image``."""
        blocks = []
        for item in VLOCRExtractor._get_parsing_list(json_data):
            label = item.get("block_label", "")
            content = item.get("block_content", "")
            if content and label != "image":
                blocks.append({
                    "type": label,
                    "text": content,
                    "bbox": item.get("block_bbox", []),
                })
        return blocks

    @staticmethod
    def _parse_tables(json_data: Dict) -> List[Dict[str, Any]]:
        """Collect the blocks labelled ``table``."""
        return [
            {
                "text": item.get("block_content", ""),
                "html": item.get("block_html", ""),
                "markdown": item.get("block_markdown", ""),
                "bbox": item.get("block_bbox", []),
            }
            for item in VLOCRExtractor._get_parsing_list(json_data)
            if item.get("block_label") == "table"
        ]

    @staticmethod
    def _parse_formulas(json_data: Dict) -> List[Dict[str, Any]]:
        """Collect the blocks labelled ``formula``."""
        return [
            {
                "latex": item.get("block_latex", ""),
                "text": item.get("block_content", ""),
                "bbox": item.get("block_bbox", []),
            }
            for item in VLOCRExtractor._get_parsing_list(json_data)
            if item.get("block_label") == "formula"
        ]

    @staticmethod
    def _build_text_from_blocks(json_data: Dict) -> str:
        """Build plain text from the parsed blocks, separated by blank lines.

        Tables and formulas get a bracketed tag, ``paragraph_title`` and
        ``header`` blocks become ``##`` headings, and empty or ``image``
        blocks are skipped.
        """
        lines = []
        for item in VLOCRExtractor._get_parsing_list(json_data):
            label = item.get("block_label", "")
            content = item.get("block_content", "")
            if not content or label == "image":
                continue
            if label == "table":
                lines.append(f"[表格] {content}")
            elif label == "formula":
                lines.append(f"[公式] {content}")
            elif label in ("paragraph_title", "header"):
                lines.append(f"## {content}")
            else:
                lines.append(content)
        return "\n\n".join(lines)
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
# ============================================================
|
| 270 |
+
# OCR API 提取器 (OpenAI 兼容格式, 无需本地推理)
|
| 271 |
+
# ============================================================
|
| 272 |
+
|
| 273 |
+
_ocr_api_client = None
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
def _get_ocr_api_client():
    """Return the shared OpenAI-compatible client, creating it on first use.

    The client is cached in the module-level ``_ocr_api_client``; ``openai``
    is imported lazily, only when the client has to be built.

    Returns:
        The cached ``OpenAI`` client configured from ``config.OCR_API_KEY``
        and ``config.OCR_API_BASE``.
    """
    global _ocr_api_client

    # Fast path: already created.
    if _ocr_api_client is not None:
        return _ocr_api_client

    from openai import OpenAI

    _ocr_api_client = OpenAI(
        api_key=config.OCR_API_KEY,
        base_url=config.OCR_API_BASE,
    )
    logger.info(
        f"OCR API 连接: model={config.OCR_API_MODEL}, base_url={config.OCR_API_BASE}"
    )
    return _ocr_api_client
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
class OCRApiExtractor:
    """OCR extractor that talks to an OpenAI-compatible chat-completions API.

    The image is sent as a base64 data URL together with a short task prompt,
    so no local model inference is needed.

    Supported tasks: ocr / table / formula / chart / spotting / seal.
    """

    # Task name -> prompt text sent alongside the image.
    PROMPTS = {
        "ocr": "OCR:",
        "table": "Table Recognition:",
        "formula": "Formula Recognition:",
        "chart": "Chart Recognition:",
        "spotting": "Spotting:",
        "seal": "Seal Recognition:",
    }

    # File suffix -> MIME type used in the data URL.
    _MIME_BY_SUFFIX = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".bmp": "image/bmp",
        ".tif": "image/tiff",
        ".tiff": "image/tiff",
    }
    # Used for in-memory images (encoded as PNG here) and unknown suffixes.
    _DEFAULT_MIME = "image/png"

    @staticmethod
    def _guess_mime(path: Union[str, Path]) -> str:
        """Return the image MIME type for ``path`` based on its suffix."""
        suffix = Path(path).suffix.lower()
        return OCRApiExtractor._MIME_BY_SUFFIX.get(suffix, OCRApiExtractor._DEFAULT_MIME)

    @staticmethod
    def extract(
        image_or_path: Union[str, Path, np.ndarray],
        task: Optional[str] = None,
        max_new_tokens: int = 2048,
    ) -> List["OCRResult"]:
        """Run one OCR request through the API.

        Args:
            image_or_path: Image path, numpy image array, or already encoded
                image bytes.
            task: Key of ``PROMPTS``; defaults to ``config.OCR_TASK``.
            max_new_tokens: Passed to the API as ``max_tokens``.

        Returns:
            A single-element list holding the ``OCRResult`` for the image.

        Raises:
            ValueError: If ``task`` is not a key of ``PROMPTS``.
        """
        import base64
        import io

        task = task or config.OCR_TASK
        # Fail before any I/O or client setup instead of with a bare KeyError
        # after the image has already been read and encoded.
        if task not in OCRApiExtractor.PROMPTS:
            raise ValueError(
                f"不支持的 OCR 任务: {task!r}. 支持: {sorted(OCRApiExtractor.PROMPTS)}"
            )
        client = _get_ocr_api_client()

        start_time = time.time()
        logger.info(f"OCR API 推理中 (task={task}) ...")

        # Image -> bytes, remembering the matching MIME type.
        mime = OCRApiExtractor._DEFAULT_MIME
        if isinstance(image_or_path, (str, Path)):
            with open(image_or_path, "rb") as f:
                img_bytes = f.read()
            mime = OCRApiExtractor._guess_mime(image_or_path)
        elif isinstance(image_or_path, np.ndarray):
            img = Image.fromarray(image_or_path).convert("RGB")
            buf = io.BytesIO()
            img.save(buf, format="PNG")
            img_bytes = buf.getvalue()
        else:
            # Anything else is passed through as-is (expected to be bytes).
            img_bytes = image_or_path

        b64 = base64.b64encode(img_bytes).decode("utf-8")
        image_url = f"data:{mime};base64,{b64}"

        messages = [{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": image_url}},
                {"type": "text", "text": OCRApiExtractor.PROMPTS[task]},
            ],
        }]

        response = client.chat.completions.create(
            model=config.OCR_API_MODEL,
            messages=messages,
            max_tokens=max_new_tokens,
        )

        # The message content may be None; treat that as "no text".
        result_text = (response.choices[0].message.content or "").strip()
        elapsed = (time.time() - start_time) * 1000

        result = OCRResult(
            page_num=1,
            markdown_text=result_text,
            ocr_time_ms=elapsed,
            source_format="image",
            text_blocks=[{"type": task, "text": result_text, "bbox": []}],
        )

        logger.info(f"OCR API 完成, 耗时 {elapsed:.0f}ms, {len(result_text)} 字符")
        return [result]

    @staticmethod
    def extract_text(
        image_or_path: Union[str, Path, np.ndarray],
        task: Optional[str] = None,
    ) -> str:
        """Convenience wrapper: return only the recognized text."""
        results = OCRApiExtractor.extract(image_or_path, task=task)
        return "\n".join(r.markdown_text for r in results)
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
# ============================================================
|
| 390 |
+
# 统一提取器入口
|
| 391 |
+
# ============================================================
|
| 392 |
+
|
| 393 |
+
def _extract_ocr(image_or_path: Union[str, Path, np.ndarray]) -> List[OCRResult]:
    """Run OCR with the engine selected by ``config.OCR_ENGINE``.

    ``"api"`` routes to ``OCRApiExtractor``; every other value routes to the
    local ``VLOCRExtractor``.
    """
    extractor = OCRApiExtractor if config.OCR_ENGINE == "api" else VLOCRExtractor
    return extractor.extract(image_or_path)
|
| 399 |
+
|
| 400 |
+
|
| 401 |
+
# ============================================================
|
| 402 |
+
# PDF 工具
|
| 403 |
+
# ============================================================
|
| 404 |
+
|
| 405 |
+
class PDFUtils:
    """PDF helpers built on PyMuPDF: page rendering and text-layer inspection.

    Every method that opens a document does so in a ``with`` block, so the
    file handle is released even when reading a page raises.
    """

    @staticmethod
    def render_page_to_image(page: fitz.Page, dpi: int = 300) -> np.ndarray:
        """Render a PyMuPDF page to an RGB numpy array.

        Args:
            page: The page to render.
            dpi: Target resolution; converted to a zoom factor relative to
                the 72-dpi base used for the transformation matrix.

        Returns:
            The rendered page as a numpy array built from an RGB PIL image.
        """
        zoom = dpi / 72.0
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        # NOTE(review): assumes the pixmap holds 3-channel RGB samples without
        # alpha, which is what get_pixmap() is expected to return by default.
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
        return np.array(img)

    @staticmethod
    def get_page_count(pdf_path: Path) -> int:
        """Return the number of pages in the PDF."""
        with fitz.open(str(pdf_path)) as doc:
            return len(doc)

    @staticmethod
    def is_scanned_pdf(pdf_path: Path, sample_pages: int = 3) -> bool:
        """Heuristically decide whether a PDF is a scanned (image-only) file.

        The first ``sample_pages`` pages are checked for an extractable text
        layer; fewer than 100 characters per checked page on average counts
        as scanned.

        Returns:
            ``True`` when the sampled pages contain almost no text. A document
            with no pages to check yields ``False``.
        """
        with fitz.open(str(pdf_path)) as doc:
            pages_to_check = min(sample_pages, len(doc))
            text_chars = sum(
                len(doc[i].get_text().strip()) for i in range(pages_to_check)
            )
        return text_chars < 100 * pages_to_check

    @staticmethod
    def extract_text_layer(pdf_path: Path) -> List[Dict[str, Any]]:
        """Extract the embedded text layer (no OCR) page by page.

        Returns:
            One dict per page that has non-blank text, with the keys
            ``page_num`` (1-based), ``text``, ``char_count`` and
            ``has_text_layer``. Pages without text are omitted.
        """
        pages = []
        with fitz.open(str(pdf_path)) as doc:
            for page_num, page in enumerate(doc, start=1):
                text = page.get_text("text")
                if text.strip():
                    pages.append({
                        "page_num": page_num,
                        "text": text,
                        "char_count": len(text),
                        "has_text_layer": True,
                    })
        return pages
|
| 465 |
+
|
| 466 |
+
|
| 467 |
+
# ============================================================
|
| 468 |
+
# LangChain PaddleOCR-VL-1.5 文档加载器
|
| 469 |
+
# ============================================================
|
| 470 |
+
|
| 471 |
+
class PaddleOCRLoader:
    """LangChain-style document loader backed by the OCR engines in this module.

    A PDF is rendered page by page and each page image is sent through
    ``_extract_ocr``; any other supported file is treated as a single image.
    Accepted suffixes are defined by ``config.SUPPORTED_FORMATS``.

    Usage:
        # Load a PDF
        loader = PaddleOCRLoader("document.pdf")
        documents = loader.load()

        # Load an image
        loader = PaddleOCRLoader("scan.png")
        documents = loader.load()

        # Lazy loading (recommended for large files)
        for doc in loader.lazy_load():
            process(doc)
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        dpi: int = config.PDF_RENDER_DPI,
        verbose: bool = True,
    ):
        """Validate the input file and store the loader settings.

        Args:
            file_path: Path of the document to load.
            dpi: Resolution used when rendering PDF pages.
            verbose: When true, progress messages are logged.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the suffix is not in ``config.SUPPORTED_FORMATS``.
        """
        self.file_path = Path(file_path)
        if not self.file_path.exists():
            raise FileNotFoundError(f"文件不存在: {self.file_path}")

        self.suffix = self.file_path.suffix.lower()
        if self.suffix not in config.SUPPORTED_FORMATS:
            raise ValueError(
                f"不支持的文件格式: {self.suffix}. "
                f"支持: {config.SUPPORTED_FORMATS}"
            )

        self.dpi = dpi
        self.verbose = verbose
        self._doc_name = self.file_path.stem
        self._is_pdf = (self.suffix == ".pdf")

    def load(self) -> List[Document]:
        """Load the whole document and return the ``Document`` list."""
        return list(self.lazy_load())

    def lazy_load(self) -> Iterator[Document]:
        """Yield ``Document`` objects one at a time."""
        if self._is_pdf:
            yield from self._load_pdf()
        else:
            yield from self._load_image()

    def _result_text(self, ocr_result: OCRResult) -> str:
        """Return the usable text of a result, or ``""`` when there is none.

        Falls back to text rebuilt from the JSON payload when the markdown
        text is empty, unwraps a ``{"text": ...}`` dict, and treats
        whitespace-only text as missing.
        """
        text = ocr_result.markdown_text
        if not text and ocr_result.json_data:
            text = self._extract_text_from_json(ocr_result.json_data)
        if isinstance(text, dict):
            text = text.get("text", "") or ""
        if not text:
            return ""
        text = text if isinstance(text, str) else str(text)
        return text if text.strip() else ""

    def _build_metadata(
        self,
        ocr_result: OCRResult,
        text: str,
        *,
        page: int,
        total_pages: int,
        ocr_time: float,
        source_format: str,
        extra: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Assemble the metadata dict shared by the PDF and image paths.

        ``extra`` entries (e.g. image dimensions) are inserted after the
        common keys and before the block counters.
        """
        metadata = {
            "source": str(self.file_path),
            "document_name": self._doc_name,
            "page": page,
            "total_pages": total_pages,
            "ocr_text_length": len(text),
            "ocr_time_ms": round(ocr_time, 1),
            "dpi": self.dpi,
            "source_format": source_format,
        }
        if extra:
            metadata.update(extra)
        metadata["tables_count"] = len(ocr_result.tables)
        metadata["formulas_count"] = len(ocr_result.formulas)
        metadata["text_blocks_count"] = len(ocr_result.text_blocks)

        # Attach table / formula payloads only when present.
        if ocr_result.tables:
            metadata["tables_markdown"] = [
                t.get("markdown", "") for t in ocr_result.tables
            ]
            metadata["tables_html"] = [
                t.get("html", "") for t in ocr_result.tables
            ]
        if ocr_result.formulas:
            metadata["formulas_latex"] = [
                f.get("latex", "") for f in ocr_result.formulas
            ]
        return metadata

    def _load_pdf(self) -> Iterator[Document]:
        """Render and OCR a PDF page by page, yielding one Document per result."""
        total_start = time.time()

        # The with-block closes the PDF even if OCR raises or the caller
        # stops consuming the generator early.
        with fitz.open(str(self.file_path)) as pdf_doc:
            page_count = len(pdf_doc)
            self._log(
                f"开始处理 PDF: {self.file_path.name} ({page_count} 页, DPI={self.dpi})"
            )

            for page_idx in range(page_count):
                page_start = time.time()
                page_no = page_idx + 1

                image = PDFUtils.render_page_to_image(pdf_doc[page_idx], dpi=self.dpi)
                results = _extract_ocr(image)

                # Release the rendered page right away; high-DPI images are large.
                del image

                ocr_time = (time.time() - page_start) * 1000

                for ocr_result in results:
                    ocr_result.page_num = page_no
                    ocr_result.source_format = "pdf"

                    text = self._result_text(ocr_result)
                    if not text:
                        self._log(f" 第 {page_no} 页: 未检测到文本")
                        continue

                    metadata = self._build_metadata(
                        ocr_result,
                        text,
                        page=page_no,
                        total_pages=page_count,
                        ocr_time=ocr_time,
                        source_format="pdf",
                    )

                    self._log(
                        f" 第 {page_no}/{page_count} 页: "
                        f"{len(text)} 字符, "
                        f"表格={metadata['tables_count']}, "
                        f"公式={metadata['formulas_count']}, "
                        f"耗时 {ocr_time:.0f}ms"
                    )

                    yield Document(page_content=text, metadata=metadata)

        gc.collect()  # force collection of leftover page-rendering memory
        self._log(f"PDF 处理完成, 总耗时 {time.time() - total_start:.1f}s")

    def _load_image(self) -> Iterator[Document]:
        """OCR a single image file.

        Raises:
            ValueError: If the file cannot be opened or verified as an image.
        """
        total_start = time.time()
        self._log(f"开始处理图片: {self.file_path.name}")

        # Check readability and capture the size, then close the handle.
        # The size is read before verify() because the image object must not
        # be used after verification.
        try:
            with Image.open(self.file_path) as img:
                image_width, image_height = img.size
                img.verify()
        except Exception as e:
            raise ValueError(f"无法读取图片文件: {e}") from e

        # The OCR engines accept the image path directly.
        results = _extract_ocr(str(self.file_path))
        ocr_time = (time.time() - total_start) * 1000
        source_format = self.suffix.lstrip(".")

        for ocr_result in results:
            ocr_result.source_format = source_format

            text = self._result_text(ocr_result)
            if not text:
                self._log(" 未检测到文本")
                continue

            metadata = self._build_metadata(
                ocr_result,
                text,
                page=1,
                total_pages=1,
                ocr_time=ocr_time,
                source_format=source_format,
                extra={"image_width": image_width, "image_height": image_height},
            )
            yield Document(page_content=text, metadata=metadata)

        self._log(f"图片处理完成, 耗时 {time.time() - total_start:.1f}s")

    def load_with_ocr_results(self) -> List[OCRResult]:
        """Return raw ``OCRResult`` objects, which carry richer structure."""
        if not self._is_pdf:
            results = _extract_ocr(str(self.file_path))
            for r in results:
                r.source_format = self.suffix.lstrip(".")
            return results

        all_results = []
        with fitz.open(str(self.file_path)) as pdf_doc:
            for page_idx in range(len(pdf_doc)):
                image = PDFUtils.render_page_to_image(pdf_doc[page_idx], dpi=self.dpi)
                results = _extract_ocr(image)
                del image  # free the rendered page before the next one
                for r in results:
                    r.page_num = page_idx + 1
                    r.source_format = "pdf"
                all_results.extend(results)
        return all_results

    @staticmethod
    def _extract_text_from_json(json_data: Dict) -> str:
        """Build plain text from an OCR JSON payload via ``VLOCRExtractor``."""
        return VLOCRExtractor._build_text_from_blocks(json_data)

    def _log(self, msg: str):
        """Log ``msg`` at info level when the loader is verbose."""
        if self.verbose:
            logger.info(msg)
|
| 699 |
+
|
| 700 |
+
|
| 701 |
+
# ============================================================
|
| 702 |
+
# 批量加载器
|
| 703 |
+
# ============================================================
|
| 704 |
+
|
| 705 |
+
class PaddleOCRDirectoryLoader:
    """Load every supported document found under a directory."""

    def __init__(
        self,
        directory: Union[str, Path],
        glob_patterns: Optional[List[str]] = None,
        **loader_kwargs,
    ):
        """Store the search settings.

        Args:
            directory: Root directory to search.
            glob_patterns: Glob patterns relative to ``directory``; defaults
                to recursive patterns for pdf/png/jpg/jpeg/bmp/tif/tiff.
            **loader_kwargs: Forwarded to each ``PaddleOCRLoader``.
        """
        self.directory = Path(directory)
        self.glob_patterns = glob_patterns or [
            "**/*.pdf", "**/*.png", "**/*.jpg", "**/*.jpeg",
            "**/*.bmp", "**/*.tif", "**/*.tiff",
        ]
        self.loader_kwargs = loader_kwargs

    def _find_files(self) -> List[Path]:
        """Return the sorted, de-duplicated files matching any pattern.

        Directories whose names happen to match a pattern are skipped so
        they are never handed to a loader.
        """
        found = set()
        for pattern in self.glob_patterns:
            found.update(p for p in self.directory.glob(pattern) if p.is_file())
        return sorted(found)

    def load(self) -> List["Document"]:
        """Load all matching files; a file that fails is logged and skipped."""
        all_docs = []
        files = self._find_files()

        if not files:
            logger.warning(f"目录 {self.directory} 中未找到支持的文档文件")
            return all_docs

        logger.info(f"在 {self.directory} 中找到 {len(files)} 个文件")

        for file_path in files:
            try:
                loader = PaddleOCRLoader(file_path, **self.loader_kwargs)
                docs = loader.load()
                all_docs.extend(docs)
                logger.info(f" ✓ {file_path.name}: {len(docs)} 页/块")
            except Exception as e:
                # Best effort: one bad file must not abort the whole batch.
                logger.error(f" ✗ {file_path.name}: {e}")

        logger.info(f"批量加载完成, 共 {len(all_docs)} 个文档块")
        return all_docs

    def lazy_load(self) -> Iterator["Document"]:
        """Yield documents file by file; a file that fails is logged and skipped."""
        for file_path in self._find_files():
            try:
                loader = PaddleOCRLoader(file_path, **self.loader_kwargs)
                yield from loader.lazy_load()
            except Exception as e:
                logger.error(f"加载失败 {file_path.name}: {e}")
|
| 760 |
+
|
| 761 |
+
|
| 762 |
+
# ============================================================
|
| 763 |
+
# 便捷函数
|
| 764 |
+
# ============================================================
|
| 765 |
+
|
| 766 |
+
def load_document(file_path: Union[str, Path], **kwargs) -> List[Document]:
    """Load a single document; PDF vs. image is decided from the file suffix.

    Extra keyword arguments are forwarded to ``PaddleOCRLoader``.
    """
    return PaddleOCRLoader(file_path, **kwargs).load()
|
| 770 |
+
|
| 771 |
+
|
| 772 |
+
def load_directory(directory: Union[str, Path], **kwargs) -> List[Document]:
    """Load every supported document under ``directory``.

    Extra keyword arguments are forwarded to ``PaddleOCRDirectoryLoader``.
    """
    return PaddleOCRDirectoryLoader(directory, **kwargs).load()
|
| 776 |
+
|
| 777 |
+
|
| 778 |
+
def ocr_to_markdown(file_path: Union[str, Path]) -> str:
    """Run OCR on ``file_path`` and return the text as Markdown.

    Always goes through ``VLOCRExtractor``, independent of
    ``config.OCR_ENGINE``.
    """
    markdown = VLOCRExtractor.extract_to_markdown(file_path)
    return markdown
|
| 781 |
+
|
| 782 |
+
|
| 783 |
+
def ocr_to_json(file_path: Union[str, Path], save_path: Optional[str] = None) -> Dict:
    """Run OCR on ``file_path`` and return the structured JSON-style dict.

    When ``save_path`` is given, the result is also written to that file.
    Always goes through ``VLOCRExtractor``, independent of
    ``config.OCR_ENGINE``.
    """
    payload = VLOCRExtractor.extract_to_json(file_path, save_path)
    return payload
|
| 786 |
+
|
| 787 |
+
|
| 788 |
+
# ============================================================
|
| 789 |
+
# 测试入口
|
| 790 |
+
# ============================================================
|
| 791 |
+
|
| 792 |
+
if __name__ == "__main__":
    # Manual test entry point:
    #   python <this file> <file_path> [--json] [--md]
    import json
    import sys

    if len(sys.argv) < 2:
        print(f"用法: python {__file__} <file_path> [--json] [--md]")
        print(f"支持格式: {config.SUPPORTED_FORMATS}")
        sys.exit(1)

    file_path = sys.argv[1]

    # Output mode: doc (default) / json / md; --json wins over --md.
    if "--json" in sys.argv:
        output_mode = "json"
    elif "--md" in sys.argv:
        output_mode = "md"
    else:
        output_mode = "doc"

    # Built up front in every mode, so the path and suffix are validated
    # before any OCR work starts.
    loader = PaddleOCRLoader(file_path, verbose=True)

    if output_mode == "json":
        result = ocr_to_json(file_path)
        print(json.dumps(result, ensure_ascii=False, indent=2)[:5000])
    elif output_mode == "md":
        md = ocr_to_markdown(file_path)
        print(md[:5000])
    else:
        documents = loader.load()
        rule = "=" * 60
        print(f"\n{rule}")
        print(f"共加载 {len(documents)} 页/文档")
        print(f"{rule}")
        for doc in documents:
            content = doc.page_content
            print(f"\n--- 第 {doc.metadata.get('page', '?')} 页 "
                  f"({len(content)} 字符) ---")
            # Preview only: show at most the first 500 characters.
            print(content[:500])
            if len(content) > 500:
                print("...")
            print(f" 元数据: source={doc.metadata.get('document_name')}, "
                  f"tables={doc.metadata.get('tables_count', 0)}, "
                  f"formulas={doc.metadata.get('formulas_count', 0)}")
|
rag_chain.py
ADDED
|
@@ -0,0 +1,440 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
============================================================
|
| 3 |
+
RAG 检索增强生成问答链
|
| 4 |
+
============================================================
|
| 5 |
+
LLM: Qwen3-8B (通过 OpenAI 兼容 API 调用)
|
| 6 |
+
嵌入: Qwen3-Embedding (通过 OpenAI 兼容 API 调用)
|
| 7 |
+
|
| 8 |
+
所有模型均通过 API 调用, 无需本地推理:
|
| 9 |
+
- Embedding API: /v1/embeddings
|
| 10 |
+
- LLM API: /v1/chat/completions
|
| 11 |
+
|
| 12 |
+
支持任意 OpenAI 兼容 API:
|
| 13 |
+
- vLLM 部署的 Qwen3 / Llama / DeepSeek 等
|
| 14 |
+
- 第三方 API (DeepSeek, 通义千问, 智谱 GLM 等)
|
| 15 |
+
- OpenAI 官方 API
|
| 16 |
+
|
| 17 |
+
功能:
|
| 18 |
+
1. LangChain LCEL RAG 问答链
|
| 19 |
+
2. 多轮对话
|
| 20 |
+
3. 流式输出
|
| 21 |
+
4. 来源引用
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
from typing import List, Optional, Dict, Any, Iterator
|
| 25 |
+
|
| 26 |
+
from langchain_core.documents import Document
|
| 27 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 28 |
+
from langchain_core.runnables import RunnableParallel
|
| 29 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 30 |
+
from langchain_core.language_models import BaseChatModel
|
| 31 |
+
from langchain_core.messages import HumanMessage, SystemMessage
|
| 32 |
+
|
| 33 |
+
from langchain_openai import ChatOpenAI
|
| 34 |
+
|
| 35 |
+
from loguru import logger
|
| 36 |
+
|
| 37 |
+
import config
|
| 38 |
+
from vector_store import VectorStoreManager
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# ============================================================
|
| 42 |
+
# LLM 工厂 (纯 API 模式)
|
| 43 |
+
# ============================================================
|
| 44 |
+
|
| 45 |
+
def create_llm(
    model_name: Optional[str] = None,
    api_base: Optional[str] = None,
    api_key: Optional[str] = None,
    temperature: Optional[float] = None,
    max_tokens: Optional[int] = None,
) -> ChatOpenAI:
    """
    Create an OpenAI-compatible chat LLM client.

    Any argument left unset falls back to the matching ``config.LLM_*`` value.

    Args:
        model_name: Model name, e.g. Qwen/Qwen3-8B.
        api_base: Base URL of the API.
        api_key: API key.
        temperature: Sampling temperature. An explicit ``0.0`` is honoured;
            only ``None`` falls back to ``config.LLM_TEMPERATURE``.
        max_tokens: Maximum number of output tokens.

    Returns:
        A ``ChatOpenAI`` instance.
    """
    # 0.0 is a legitimate temperature but is falsy, so `or` would silently
    # replace it with the configured default; test for None explicitly.
    if temperature is None:
        temperature = config.LLM_TEMPERATURE

    return ChatOpenAI(
        model=model_name or config.LLM_MODEL_NAME,
        api_key=api_key or config.LLM_API_KEY,
        base_url=api_base or config.LLM_API_BASE,
        temperature=temperature,
        max_tokens=max_tokens or config.LLM_MAX_TOKENS,
    )
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# ============================================================
|
| 75 |
+
# RAG 问答链
|
| 76 |
+
# ============================================================
|
| 77 |
+
|
| 78 |
+
class RAGChain:
    """
    Retrieval-augmented generation (RAG) question-answering chain.

    Flow:
        query -> vector-store retrieval -> context formatting ->
        prompt template -> LLM generation -> structured answer (with sources)

    Usage:
        rag = RAGChain(vector_store_manager)
        result = rag.query("文档主要内容是什么?")
    """

    # Cap on the total number of context characters sent to the LLM
    # (sized for a small-VRAM deployment with a 1152-token limit).
    MAX_CONTEXT_CHARS = 1800

    def __init__(
        self,
        vector_store_manager: VectorStoreManager,
        llm: Optional[BaseChatModel] = None,
        top_k: int = config.RETRIEVAL_TOP_K,
        system_prompt: Optional[str] = None,
        search_type: str = "similarity",
    ):
        """
        Args:
            vector_store_manager: Store used for retrieval.
            llm: Chat model; when omitted one is built with ``create_llm()``.
            top_k: Number of chunks requested per retrieval.
            system_prompt: System prompt; defaults to ``config.SYSTEM_PROMPT``.
            search_type: ``"mmr"``, ``"similarity_score"``, or anything else
                for plain similarity search.
        """
        self.vector_store_manager = vector_store_manager
        self.llm = llm or create_llm()
        self.top_k = top_k
        self.system_prompt = system_prompt or config.SYSTEM_PROMPT
        self.search_type = search_type
        # Generation-only sub-chain (prompt -> LLM -> str). It is shared by
        # the full LCEL chain and by query(), which supplies its own context.
        self._answer_chain = self._build_answer_chain()
        self._chain = self._build_chain()

        logger.info(
            f"RAG 问答链初始化完成 (LLM={config.LLM_MODEL_NAME}, "
            f"top_k={top_k}, search={search_type})"
        )

    def _build_answer_chain(self):
        """Build the prompt -> LLM -> string-parser part of the pipeline.

        The returned runnable expects a dict with the keys ``context``,
        ``question`` and ``system_prompt``.
        """
        prompt = ChatPromptTemplate.from_messages([
            ("system", "{system_prompt}"),
            ("human", config.RAG_PROMPT_TEMPLATE),
        ])
        return prompt | self.llm | StrOutputParser()

    def _build_chain(self):
        """Build the full RAG chain with LangChain LCEL.

        The chain takes ``{"query": str}``, retrieves and formats the
        context, and pipes everything into the generation sub-chain.
        """
        return (
            RunnableParallel({
                "context": lambda inputs: self._retrieve_and_format(inputs["query"]),
                "question": lambda inputs: inputs["query"],
                "system_prompt": lambda _: self.system_prompt,
            })
            | self._answer_chain
        )

    def _retrieve_and_format(self, query: str) -> str:
        """Retrieve documents for ``query`` and render them as one context string."""
        return self._format_docs(self._retrieve(query))

    def _retrieve(self, query: str) -> List[Document]:
        """Fetch up to ``top_k`` documents using the configured search type."""
        if self.search_type == "mmr":
            return self.vector_store_manager.max_marginal_relevance_search(
                query, k=self.top_k
            )
        elif self.search_type == "similarity_score":
            results = self.vector_store_manager.similarity_search_with_score(
                query, k=self.top_k
            )
            # Scores are dropped; only the documents are returned.
            return [doc for doc, _ in results]
        else:
            return self.vector_store_manager.similarity_search(query, k=self.top_k)

    @classmethod
    def _format_docs(cls, docs: List[Document]) -> str:
        """Render retrieved documents as a numbered, length-capped context string.

        Each document gets an equal share of ``MAX_CONTEXT_CHARS``; longer
        content is truncated and suffixed with ``...``.
        """
        if not docs:
            return "(未找到相关文档内容)"

        # Limit each chunk so the total stays within the small-VRAM token budget.
        max_chunk_chars = cls.MAX_CONTEXT_CHARS // max(len(docs), 1)

        parts = []
        for i, doc in enumerate(docs, 1):
            page = doc.metadata.get("page", "未知")
            doc_name = doc.metadata.get("document_name", "未知文档")

            content = doc.page_content
            if len(content) > max_chunk_chars:
                content = content[:max_chunk_chars] + "..."

            header = f"[{i}] {doc_name} p{page}"
            parts.append(f"{header}\n{content}")

        return "\n\n---\n\n".join(parts)

    # ---- Query interface ----

    def query(self, question: str) -> Dict[str, Any]:
        """
        Answer a single question.

        Retrieval runs exactly once, so the returned ``sources`` and
        ``context`` are the very documents the LLM was given.

        Returns:
            {"query": str, "answer": str, "sources": [...], "context": str}
        """
        logger.info(f"RAG 查询: {question[:100]}...")

        retrieved_docs = self._retrieve(question)
        context = self._format_docs(retrieved_docs)
        # Feed the already-retrieved context to the generation sub-chain
        # instead of invoking the full chain, which would retrieve again.
        answer = self._answer_chain.invoke({
            "context": context,
            "question": question,
            "system_prompt": self.system_prompt,
        })

        sources = self._build_sources(retrieved_docs)

        logger.info(f"生成完成: {len(answer)} 字符, {len(sources)} 个来源")
        return {
            "query": question,
            "answer": answer,
            "sources": sources,
            "context": context,
        }

    def query_stream(self, question: str) -> Iterator[str]:
        """Answer a question, yielding the output chunks as they are produced."""
        logger.info(f"RAG 流式查询: {question[:100]}...")
        for chunk in self._chain.stream({"query": question}):
            yield chunk

    def query_with_history(
        self,
        question: str,
        chat_history: Optional[List[Dict[str, str]]] = None,
    ) -> Dict[str, Any]:
        """Answer a question in a multi-turn conversation.

        The formatted history is appended to the system prompt; retrieval
        uses only the current ``question``.

        Args:
            question: The current user question.
            chat_history: Earlier turns as dicts with ``role`` and ``content``.

        Returns:
            {"query": str, "answer": str, "sources": [...], "context": str}
        """
        chat_history = chat_history or []

        history_context = self._format_history(chat_history)
        retrieved_docs = self._retrieve(question)
        context = self._format_docs(retrieved_docs)

        messages = [
            SystemMessage(content=(
                f"{self.system_prompt}\n\n"
                f"## 对话历史:\n{history_context}"
            )),
            HumanMessage(content=config.RAG_PROMPT_TEMPLATE.format(
                system_prompt="",
                context=context,
                question=question,
            )),
        ]

        response = self.llm.invoke(messages)
        answer = response.content

        return {
            "query": question,
            "answer": answer,
            "sources": self._build_sources(retrieved_docs),
            "context": context,
        }

    @staticmethod
    def _build_sources(docs: List[Document]) -> List[Dict[str, Any]]:
        """Summarize retrieved documents as source records (1-based rank, 300-char excerpt)."""
        return [
            {
                "rank": i,
                "content": doc.page_content[:300],
                "page": doc.metadata.get("page", "未知"),
                "document": doc.metadata.get("document_name", "未知"),
                "content_type": doc.metadata.get("content_type", "text"),
            }
            for i, doc in enumerate(docs, 1)
        ]

    @staticmethod
    def _format_history(chat_history: List[Dict[str, str]]) -> str:
        """Render the most recent turns as ``role: content`` lines."""
        if not chat_history:
            return "(无历史对话)"
        parts = []
        for turn in chat_history[-8:]:  # keep only the last 4 rounds (8 messages)
            role = "用户" if turn.get("role") == "user" else "助手"
            parts.append(f"{role}: {turn.get('content', '')}")
        return "\n".join(parts)
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
# ============================================================
|
| 261 |
+
# PDF 完整问答流水线
|
| 262 |
+
# ============================================================
|
| 263 |
+
|
| 264 |
+
class PDFRAGPipeline:
    """
    End-to-end document question-answering pipeline (all models via API).

    One object covers: document upload -> OCR -> cleaning -> splitting ->
    embedding -> vector-store ingestion -> question answering.

    Usage:
        pipeline = PDFRAGPipeline()
        pipeline.ingest("document.pdf")
        result = pipeline.ask("文档主要内容是什么?")
    """

    def __init__(
        self,
        llm: Optional[BaseChatModel] = None,
        store_type: Optional[str] = None,
        chunk_size: int = config.CHUNK_SIZE,
        chunk_overlap: int = config.CHUNK_OVERLAP,
        verbose: bool = True,
    ):
        """
        Args:
            llm: Chat model; when omitted one is built with ``create_llm()``.
            store_type: Vector store type; defaults to ``config.VECTOR_STORE_TYPE``.
            chunk_size: Chunk size passed to the text-processing pipeline.
            chunk_overlap: Chunk overlap passed to the text-processing pipeline.
            verbose: When true, progress messages are printed.
        """
        self.llm = llm or create_llm()
        self.store_type = store_type or config.VECTOR_STORE_TYPE
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.verbose = verbose

        self._vector_store_manager: Optional[VectorStoreManager] = None
        self._rag_chain: Optional[RAGChain] = None

    def ingest(self, file_path: str, clear_existing: bool = True) -> int:
        """
        Process a document and build the vector database.

        Supported formats: PDF / PNG / JPG / BMP / TIF

        Args:
            file_path: Path of the document to ingest.
            clear_existing: When true, the vector store is cleared first.

        Returns:
            The value returned by ``VectorStoreManager.add_documents``.
        """
        from ocr_loader import PaddleOCRLoader
        from text_processor import TextProcessingPipeline

        logger.info(f"开始入库: {file_path}")

        # Step 1: OCR
        self._log("Step 1/4: PaddleOCR-VL-1.5 识别...")
        loader = PaddleOCRLoader(file_path, verbose=False)
        raw_docs = loader.load()
        self._log(f" ✓ 识别完成: {len(raw_docs)} 页/文档")

        # Step 2: cleaning and splitting
        self._log("Step 2/4: 文本清洗与分割...")
        pipeline = TextProcessingPipeline(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        chunks = pipeline.process(raw_docs)
        self._log(f" ✓ 分割完成: {len(chunks)} 个文本块")

        # Step 3: embedding (through the Embedding API) and storage
        self._log("Step 3/4: Embedding API 向量化...")
        self._vector_store_manager = VectorStoreManager(store_type=self.store_type)
        if clear_existing:
            self._vector_store_manager.clear()
        chunk_count = self._vector_store_manager.add_documents(chunks)
        self._log(f" ✓ 入库完成: {chunk_count} 个文本块")

        # Step 4: set up the RAG chain on the fresh store
        self._log("Step 4/4: 初始化 RAG 引擎...")
        self._rag_chain = RAGChain(
            vector_store_manager=self._vector_store_manager,
            llm=self.llm,
        )
        self._log(" ✓ 问答引擎就绪")
        self._log("入库完成! 可以开始提问。")

        return chunk_count

    def ingest_multiple(self, file_paths: List[str], clear_existing: bool = True) -> int:
        """Ingest several files and return the summed ``ingest`` results.

        The store is cleared at most once, before the first file, so later
        files are added on top of earlier ones.
        """
        total = 0
        for i, fp in enumerate(file_paths):
            total += self.ingest(fp, clear_existing=(clear_existing and i == 0))
        return total

    def _ensure_chain(self) -> "RAGChain":
        """Return the RAG chain, attaching to an existing vector store if needed.

        Raises:
            RuntimeError: If no chain exists yet and the vector store is empty.
        """
        if self._rag_chain is None:
            if self._vector_store_manager is None:
                self._vector_store_manager = VectorStoreManager(store_type=self.store_type)
            if self._vector_store_manager.get_document_count() == 0:
                raise RuntimeError("向量数据库为空! 请先调用 ingest() 处理文档。")
            self._rag_chain = RAGChain(
                vector_store_manager=self._vector_store_manager,
                llm=self.llm,
            )
        return self._rag_chain

    def ask(self, question: str) -> Dict[str, Any]:
        """Answer a single question (see ``RAGChain.query``).

        Raises:
            RuntimeError: If nothing has been ingested and the store is empty.
        """
        return self._ensure_chain().query(question)

    def ask_stream(self, question: str) -> Iterator[str]:
        """Answer a question as a stream of chunks (see ``RAGChain.query_stream``).

        Like ``ask``, this attaches to an already-populated vector store
        when ``ingest`` has not been called on this instance.

        Raises:
            RuntimeError: If nothing has been ingested and the store is empty.
        """
        return self._ensure_chain().query_stream(question)

    def ask_with_history(
        self, question: str,
        chat_history: Optional[List[Dict[str, str]]] = None,
    ) -> Dict[str, Any]:
        """Answer a question with chat history (see ``RAGChain.query_with_history``).

        Like ``ask``, this attaches to an already-populated vector store
        when ``ingest`` has not been called on this instance.

        Raises:
            RuntimeError: If nothing has been ingested and the store is empty.
        """
        return self._ensure_chain().query_with_history(question, chat_history)

    @property
    def is_ready(self) -> bool:
        """True when the vector store holds at least one document."""
        try:
            if self._vector_store_manager is None:
                self._vector_store_manager = VectorStoreManager(store_type=self.store_type)
            return self._vector_store_manager.get_document_count() > 0
        except Exception:
            # Deliberate best-effort probe: any failure means "not ready".
            return False

    @property
    def stats(self) -> Dict[str, Any]:
        """Vector-store statistics, or a ``not_initialized`` marker."""
        if self._vector_store_manager is None:
            return {"status": "not_initialized"}
        return self._vector_store_manager.get_stats()

    def _log(self, msg: str):
        """Print a progress message when ``verbose`` is enabled."""
        if self.verbose:
            print(msg)
|
| 386 |
+
|
| 387 |
+
|
| 388 |
+
# ============================================================
|
| 389 |
+
# 便捷函数
|
| 390 |
+
# ============================================================
|
| 391 |
+
|
| 392 |
+
def quick_qa(file_path: str, question: str) -> Dict[str, Any]:
    """Convenience wrapper: ask one question about one document (one-shot).

    Runs OCR, text processing and vector-store construction (clearing any
    existing data), then queries a fresh ``RAGChain``.
    """
    from ocr_loader import PaddleOCRLoader
    from text_processor import TextProcessingPipeline
    from vector_store import build_vector_store

    raw_docs = PaddleOCRLoader(file_path, verbose=False).load()
    chunks = TextProcessingPipeline().process(raw_docs)
    store = build_vector_store(chunks, clear_existing=True)
    return RAGChain(vector_store_manager=store).query(question)
|
| 405 |
+
|
| 406 |
+
|
| 407 |
+
# ============================================================
|
| 408 |
+
# 测试入口
|
| 409 |
+
# ============================================================
|
| 410 |
+
|
| 411 |
+
if __name__ == "__main__":
    # Command-line smoke test: ingest one file and answer one question.
    import sys

    if len(sys.argv) < 3:
        print(f"用法: python {__file__} <file_path> <question>")
        print(f"示例: python {__file__} document.pdf '文档主要内容是什么?'")
        sys.exit(1)

    file_path, question = sys.argv[1], sys.argv[2]
    rule = "=" * 60

    print(f"\n{rule}")
    print(" PDF/文档 智能问答测试")
    print(f" 文件: {file_path}")
    print(f" 问题: {question}")
    print(rule)

    result = quick_qa(file_path, question)

    # Answer section
    print(f"\n{rule}")
    print(" 回答:")
    print(rule)
    print(result["answer"])

    # Source section: one header line plus a 150-character excerpt per source
    print(f"\n{rule}")
    print(" 参考来源:")
    print(rule)
    for src in result["sources"]:
        print(f" [{src['rank']}] {src['document']} 第{src['page']}页 ({src['content_type']})")
        print(f" {src['content'][:150]}...")
|
requirements.txt
CHANGED
|
@@ -1,18 +1,32 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ============================================================
|
| 2 |
+
# PDF OCR 智能问答系统 依赖
|
| 3 |
+
# 模型栈: PaddleOCR-VL-0.9B (API) + Qwen3-Embedding-0.6B (API) + Qwen3-1.7B (API)
|
| 4 |
+
# 需事先启动 vLLM 或其他兼容 API 服务
|
| 5 |
+
# ============================================================
|
| 6 |
|
| 7 |
+
|
| 8 |
+
# --- PDF & 图片处理 ---
|
| 9 |
+
PyMuPDF>=1.24.0
|
| 10 |
+
Pillow>=10.0.0
|
| 11 |
+
numpy>=1.24.0
|
| 12 |
+
|
| 13 |
+
# --- LangChain 生态 ---
|
| 14 |
+
langchain>=0.3.0
|
| 15 |
+
langchain-core>=0.3.0
|
| 16 |
+
langchain-community>=0.3.0
|
| 17 |
+
langchain-text-splitters>=0.3.0
|
| 18 |
+
langchain-openai>=0.2.0 # OpenAI 兼容 API 客户端 (Embedding + LLM)
|
| 19 |
+
|
| 20 |
+
# --- 向量数据库 ---
|
| 21 |
+
chromadb>=0.5.0
|
| 22 |
+
# faiss-cpu (可选)
|
| 23 |
+
|
| 24 |
+
# --- Web UI ---
|
| 25 |
+
fastapi>=0.110.0
|
| 26 |
+
uvicorn>=0.29.0
|
| 27 |
+
python-multipart>=0.0.9
|
| 28 |
+
|
| 29 |
+
# --- 工具 ---
|
| 30 |
+
python-dotenv>=1.0.0
|
| 31 |
+
tqdm>=4.66.0
|
| 32 |
+
loguru>=0.7.0
|
run.py
ADDED
|
@@ -0,0 +1,465 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
============================================================
|
| 4 |
+
PDF OCR 智能问答系统 — 端到端运行脚本
|
| 5 |
+
============================================================
|
| 6 |
+
|
| 7 |
+
用法:
|
| 8 |
+
# 交互模式: 处理文档后进入问答 REPL
|
| 9 |
+
python run.py -f document.pdf
|
| 10 |
+
|
| 11 |
+
# 单次问答
|
| 12 |
+
python run.py -f document.pdf -q "文档主要内容是什么?"
|
| 13 |
+
|
| 14 |
+
# 批量处理多个文档
|
| 15 |
+
python run.py -f doc1.pdf doc2.png scan3.jpg
|
| 16 |
+
|
| 17 |
+
# 指定分块参数
|
| 18 |
+
python run.py -f document.pdf --chunk-size 1000 --chunk-overlap 200
|
| 19 |
+
|
| 20 |
+
# 从已有向量库加载 (跳过 OCR, 直接问答)
|
| 21 |
+
python run.py --load
|
| 22 |
+
|
| 23 |
+
# 清空旧数据重新处理
|
| 24 |
+
python run.py -f document.pdf --clear
|
| 25 |
+
|
| 26 |
+
# 显示检索到的原文
|
| 27 |
+
python run.py -f document.pdf -q "问题" --show-sources
|
| 28 |
+
|
| 29 |
+
环境变量 (或 .env 文件):
|
| 30 |
+
EMBEDDING_API_BASE Embedding API 地址
|
| 31 |
+
EMBEDDING_MODEL_NAME Embedding 模型名
|
| 32 |
+
LLM_API_BASE LLM API 地址
|
| 33 |
+
LLM_API_KEY LLM API Key
|
| 34 |
+
LLM_MODEL_NAME LLM 模型名
|
| 35 |
+
"""
|
| 36 |
+
|
| 37 |
+
import argparse
|
| 38 |
+
import json
|
| 39 |
+
import os
|
| 40 |
+
import sys
|
| 41 |
+
import time
|
| 42 |
+
from pathlib import Path
|
| 43 |
+
from typing import List, Optional
|
| 44 |
+
|
| 45 |
+
# ---- 环境补丁 (必须在其他导入之前) ----
|
| 46 |
+
def _patch():
    """Apply environment patches before the project modules are imported.

    Registers an empty, package-like placeholder module named
    ``langchain_text_splitters`` in ``sys.modules`` unless a module of that
    name is already loaded, then attempts to import ``torch`` and ignores
    its absence.
    """
    from types import ModuleType

    stub_name = "langchain_text_splitters"
    if stub_name not in sys.modules:
        stub = ModuleType(stub_name)
        stub.__path__ = []  # an empty __path__ makes the stub look like a package
        sys.modules[stub_name] = stub

    # NOTE(review): presumably torch is pre-imported for load-order reasons — confirm.
    try:
        import torch  # noqa: F401
    except ImportError:
        pass
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
_patch()
|
| 59 |
+
|
| 60 |
+
# 项目导入
|
| 61 |
+
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
| 62 |
+
|
| 63 |
+
import config
|
| 64 |
+
from ocr_loader import PaddleOCRLoader
|
| 65 |
+
from text_processor import TextProcessingPipeline, RecursiveCharacterTextSplitter
|
| 66 |
+
from embeddings import get_embedding_model
|
| 67 |
+
from vector_store import VectorStoreManager, build_vector_store
|
| 68 |
+
from rag_chain import RAGChain, create_llm, PDFRAGPipeline
|
| 69 |
+
|
| 70 |
+
# Expose the project's own splitter on the placeholder module, so that
# `from langchain_text_splitters import RecursiveCharacterTextSplitter`
# resolves to it.
import sys as _sys

_lts = _sys.modules.get("langchain_text_splitters")
if _lts is not None:
    setattr(_lts, "RecursiveCharacterTextSplitter", RecursiveCharacterTextSplitter)
|
| 75 |
+
|
| 76 |
+
from loguru import logger
|
| 77 |
+
|
| 78 |
+
# ============================================================
|
| 79 |
+
# Banner
|
| 80 |
+
# ============================================================
|
| 81 |
+
|
| 82 |
+
BANNER = r"""
|
| 83 |
+
┌──────────────────────────────────────────────────────┐
|
| 84 |
+
│ 📄 PDF OCR 智能问答系统 │
|
| 85 |
+
│ │
|
| 86 |
+
│ OCR: PaddleOCR-VL-1.5 (本地) │
|
| 87 |
+
│ 嵌入: {emb_model} │
|
| 88 |
+
│ LLM: {llm_model} │
|
| 89 |
+
│ 向量库: {vec_store} │
|
| 90 |
+
└──────────────────────────────────────────────────────┘
|
| 91 |
+
"""
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def print_banner():
    """Print the start-up banner with the configured model and store names."""

    def _clip(name):
        # Names longer than 35 characters are cut to 32 plus an ellipsis.
        return name[:32] + "..." if len(name) > 35 else name

    print(BANNER.format(
        emb_model=_clip(config.EMBEDDING_MODEL_NAME),
        llm_model=_clip(config.LLM_MODEL_NAME),
        vec_store=config.VECTOR_STORE_TYPE,
    ))
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
# ============================================================
|
| 107 |
+
# 步骤函数
|
| 108 |
+
# ============================================================
|
| 109 |
+
|
| 110 |
+
def _save_documents(docs: list, path: Path, label: str = "文档"):
    """Save a list of LangChain Documents to a JSON file.

    Parent directories are created as needed. Only metadata values that are
    str, int, float, bool or None are written; other values are dropped.

    Args:
        docs: Objects exposing ``page_content`` and ``metadata``.
        path: Destination JSON file.
        label: Prefix used in the confirmation message.
    """
    json_safe = (str, int, float, bool, type(None))
    path.parent.mkdir(parents=True, exist_ok=True)

    records = [
        {
            "page_content": doc.page_content,
            "metadata": {
                key: value
                for key, value in doc.metadata.items()
                if isinstance(value, json_safe)
            },
        }
        for doc in docs
    ]

    with path.open("w", encoding="utf-8") as handle:
        json.dump(records, handle, ensure_ascii=False, indent=2)
    print(f" 💾 {label}已保存: {path} ({len(records)} 条)")
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def step_ocr(file_paths: List[str], output_dir: Optional[Path] = None) -> list:
    """Step 1: OCR every input file and merge the results.

    Missing files and unsupported suffixes are logged and skipped. When
    ``output_dir`` is given and anything was recognized, all results are
    saved together in ``ocr_results.json`` inside it.

    Returns:
        The documents from all processed files, in input order.
    """
    collected = []
    for raw_path in file_paths:
        source = Path(raw_path)
        if not source.exists():
            logger.error(f"文件不存在: {source}")
            continue

        ext = source.suffix.lower()
        if ext not in config.SUPPORTED_FORMATS:
            logger.warning(f"跳过不支持格式: {source} (支持: {config.SUPPORTED_FORMATS})")
            continue

        icon = "🖼️" if ext != ".pdf" else "📄"
        print(f" {icon} 正在识别: {source.name} ...", end=" ", flush=True)
        started = time.time()
        docs = PaddleOCRLoader(str(source), verbose=True).load()
        elapsed = time.time() - started
        print(f"{len(docs)} 页/文档 ({elapsed:.1f}s)")
        collected.extend(docs)

    # Save once, after every file has been recognized.
    if output_dir and collected:
        _save_documents(collected, output_dir / "ocr_results.json", "OCR结果 ")

    return collected
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def step_process(
    documents: list, chunk_size: int, chunk_overlap: int,
    output_dir: Optional[Path] = None
) -> list:
    """Step 2: clean and split the documents into chunks.

    When ``output_dir`` is given and chunks were produced, they are all
    saved together in ``chunks.json`` inside it.

    Returns:
        The chunks produced by ``TextProcessingPipeline.process``.
    """
    print(f" ✂️ 正在分割: {len(documents)} 个文档 ...", end=" ", flush=True)
    started = time.time()
    splitter = TextProcessingPipeline(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.process(documents)
    print(f"→ {len(chunks)} 个文本块 ({time.time() - started:.1f}s)")

    if output_dir and chunks:
        _save_documents(chunks, output_dir / "chunks.json", "分块结果 ")

    return chunks
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def step_embed(chunks: list, clear_existing: bool = True) -> VectorStoreManager:
    """Step 3: embed the chunks and store them in the vector store.

    Args:
        chunks: Text chunks to embed.
        clear_existing: Forwarded to ``build_vector_store``. Defaults to
            True, which matches the previous hard-coded behavior.

    Returns:
        The manager returned by ``build_vector_store``.
    """
    print(f" 🧠 正在向量化: {len(chunks)} 个文本块 ...", end=" ", flush=True)
    t0 = time.time()
    manager = build_vector_store(chunks, clear_existing=clear_existing)
    elapsed = time.time() - t0
    print(f"完成 ({elapsed:.1f}s)")
    return manager


def step_rag(manager: VectorStoreManager):
    """Step 4: create the RAG chain on top of a populated vector store."""
    llm = create_llm()
    chain = RAGChain(vector_store_manager=manager, llm=llm)
    return chain


# ============================================================
# Core workflow
# ============================================================

def run_ingest(
    file_paths: List[str],
    chunk_size: int = config.CHUNK_SIZE,
    chunk_overlap: int = config.CHUNK_OVERLAP,
    clear: bool = True,
    output_dir: Optional[Path] = None,
) -> VectorStoreManager:
    """Full ingestion workflow: OCR -> processing -> embedding -> storage.

    Exits the process with status 1 when OCR yields no documents.

    Args:
        file_paths: Files to ingest.
        chunk_size: Chunk size for splitting.
        chunk_overlap: Chunk overlap for splitting.
        clear: Whether existing vector-store data is cleared before the
            new chunks are added.
        output_dir: Optional directory for intermediate JSON dumps.

    Returns:
        The vector store manager holding the ingested chunks.
    """
    print("\n" + "─" * 55)
    print(" 📥 阶段 1: 文档入库")
    print("─" * 55)

    # Step 1: OCR
    t_start = time.time()
    documents = step_ocr(file_paths, output_dir=output_dir)
    if not documents:
        logger.error("未识别到任何文本内容, 请检查文件是否包含可读文字")
        sys.exit(1)
    print(f" 总计: {len(documents)} 个原始文档页")

    # Step 2: cleaning and splitting
    chunks = step_process(documents, chunk_size, chunk_overlap,
                          output_dir=output_dir)

    # Step 3: embedding and storage. `clear` was previously ignored and the
    # store was always wiped; it is now honoured.
    manager = step_embed(chunks, clear_existing=clear)

    total_time = time.time() - t_start
    print(f"\n ✅ 入库完成 (总耗时 {total_time:.1f}s)")
    print(f" 文档: {len(documents)} 页 → {len(chunks)} 个文本块")
    # The value printed here is the embedding model name, not a dimension.
    print(f" 嵌入模型: {config.EMBEDDING_MODEL_NAME}")
    print(f" 存储: {config.VECTOR_STORE_TYPE} @ {config.VECTOR_DB_DIR}")

    return manager
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
def run_qa(chain: RAGChain, question: str, show_sources: bool = False):
    """Answer a single question and print the result.

    Args:
        chain: RAG chain whose ``query`` method is called with ``question``.
        question: The question to ask.
        show_sources: When true, also print each entry of
            ``result["sources"]`` with the first 120 characters of its content.

    Returns:
        The result object returned by ``chain.query``.
    """
    rule = "─" * 55

    print("\n" + rule)
    print(f" ❓ 问题: {question}")
    print(rule)

    began = time.time()
    result = chain.query(question)
    took = time.time() - began

    print(f"\n 🤖 回答 ({took:.1f}s):")
    print(rule)
    print(result["answer"])

    if not show_sources:
        return result

    print(f"\n 📚 参考来源 ({len(result['sources'])} 条):")
    print(rule)
    for item in result["sources"]:
        headline = (f" [{item['rank']}] {item['document']} | 第{item['page']}页 "
                    f"| {item['content_type']}")
        print(headline)
        print(f" {item['content'][:120]}...")

    return result
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def run_repl(chain: RAGChain):
    """Run the interactive question-answering REPL.

    Reads questions from stdin until the user quits (``:q``/``:quit``/``:exit``,
    EOF, or Ctrl-C at the prompt). Lines starting with ``:`` are commands:
    ``:s`` toggles source display, ``:c`` clears the screen, ``:h`` prints help.
    Every other non-empty line is sent to ``chain.query_with_history`` together
    with the accumulated chat history.

    A failed query is reported and the loop continues; the failed turn is not
    added to the history, so one transient error does not end the session.

    Args:
        chain: RAG chain providing ``query_with_history(question, history)``.
    """
    print("\n" + "─" * 55)
    print(" 💬 交互问答模式")
    print("─" * 55)
    print(" 输入问题后回车, 输入 :s 切换来源显示")
    print(" 输入 :q 退出, :c 清屏, :h 帮助")
    print("─" * 55)

    chat_history = []
    show_sources = False

    while True:
        try:
            user_input = input("\n 🔍 > ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\n 再见! 👋")
            break

        if not user_input:
            continue

        # Command handling
        if user_input.startswith(":"):
            cmd = user_input[1:].strip().lower()
            if cmd in ("q", "quit", "exit"):
                print(" 再见! 👋")
                break
            elif cmd == "s":
                show_sources = not show_sources
                print(f" 来源显示: {'开启' if show_sources else '关闭'}")
            elif cmd == "c":
                os.system("clear" if os.name != "nt" else "cls")
            elif cmd == "h":
                print(" 命令: :q 退出 | :s 切换来源 | :c 清屏 | :h 帮助")
            else:
                print(f" 未知命令: {user_input}")
            continue

        # Question answering
        t0 = time.time()
        try:
            result = chain.query_with_history(user_input, chat_history)
        except Exception as exc:
            # REPL boundary: keep the session (and its history) alive when a
            # single request fails instead of crashing with a traceback.
            print(f" ❌ 问答失败: {exc}")
            continue
        elapsed = time.time() - t0

        print(f"\n 🤖 ({elapsed:.1f}s):")
        print(f" {result['answer']}")

        if show_sources:
            print(f"\n 📚 来源 ({len(result['sources'])} 条):")
            for src in result["sources"]:
                print(f" [{src['rank']}] {src['document']} "
                      f"第{src['page']}页 | {src['content_type']}")

        # Only successful turns are recorded in the history.
        chat_history.append({"role": "user", "content": user_input})
        chat_history.append({"role": "assistant", "content": result["answer"]})
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
# ============================================================
|
| 317 |
+
# API 连通性检查
|
| 318 |
+
# ============================================================
|
| 319 |
+
|
| 320 |
+
def _probe_api(label: str, base_url: str, timeout: float = 5.0) -> bool:
    """Send ``HEAD {base_url}/models`` and print whether the server answered.

    Args:
        label: Name shown in the printed status line.
        base_url: API base URL; a trailing ``/`` is stripped.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        ``True`` when the server answered with a non-5xx status, else ``False``.
    """
    import urllib.error
    import urllib.request

    url = base_url.rstrip("/")
    req = urllib.request.Request(f"{url}/models", method="HEAD")
    try:
        # Close the response explicitly; only reachability matters here.
        with urllib.request.urlopen(req, timeout=timeout):
            pass
    except urllib.error.HTTPError as exc:
        # An HTTP error status still proves the server is reachable: the probe
        # sends no credentials and some servers reject HEAD (401/404/405).
        exc.close()
        if exc.code >= 500:
            print(f" ⚠️ {label}: {url} — {exc}")
            return False
        print(f" ✅ {label}: {url} (HTTP {exc.code})")
        return True
    except Exception as exc:
        # Best-effort diagnostic: any other failure (DNS, refused connection,
        # timeout, malformed URL) is reported, never raised.
        print(f" ⚠️ {label}: {url} — {exc}")
        return False

    print(f" ✅ {label}: {url}")
    return True


def check_apis() -> bool:
    """Check whether the Embedding API and the LLM API are reachable.

    Probes ``config.EMBEDDING_API_BASE`` and ``config.LLM_API_BASE`` in that
    order and prints one status line per API. Both probes always run.

    Returns:
        ``True`` only when both APIs answered.
    """
    emb_ok = _probe_api("Embedding API", config.EMBEDDING_API_BASE)
    llm_ok = _probe_api("LLM API", config.LLM_API_BASE)
    return emb_ok and llm_ok
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
# ============================================================
|
| 350 |
+
# 主入口
|
| 351 |
+
# ============================================================
|
| 352 |
+
|
| 353 |
+
def main():
    """Command-line entry point: ingest or load documents, then answer questions.

    Flow:
        1. Parse arguments and print the banner.
        2. Unless ``--skip-api-check`` is given, print API connectivity status
           (the result is informational and does not stop the run).
        3. Either load the existing vector store (``--load``) or ingest the
           files given with ``-f/--files``; with neither, print help and exit
           with status 1.
        4. Build the RAG chain and answer one question (``-q``) or start the
           interactive REPL.
    """
    parser = argparse.ArgumentParser(
        description="PDF OCR 智能问答系统 — 端到端运行脚本",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
python run.py -f document.pdf # 交互问答
python run.py -f doc.pdf -q "主要内容?" # 单次问答
python run.py -f a.pdf b.png --clear # 批量处理
python run.py --load # 加载已有向量库
""",
    )
    # No default file list: a machine-specific default made ``args.files``
    # always truthy, so the "specify -f or --load" branch below was unreachable.
    parser.add_argument(
        "-f", "--files", nargs="+",
        default=None,
        help="要处理的文档路径 (PDF/PNG/JPG/BMP/TIF)",
    )
    parser.add_argument(
        "-q", "--question",
        help="单次问答 (不进入交互模式)",
    )
    parser.add_argument(
        "--load", action="store_true",
        help="加载已有向量库, 跳过 OCR 处理",
    )
    parser.add_argument(
        "--clear", action="store_true",
        help="清空旧向量库数据后重新处理",
    )
    parser.add_argument(
        "--chunk-size", type=int, default=config.CHUNK_SIZE,
        help=f"文本块大小 (默认: {config.CHUNK_SIZE})",
    )
    parser.add_argument(
        "--chunk-overlap", type=int, default=config.CHUNK_OVERLAP,
        help=f"块间重叠字符数 (默认: {config.CHUNK_OVERLAP})",
    )
    parser.add_argument(
        "--show-sources", action="store_true",
        help="在回答中显示参考来源",
    )
    parser.add_argument(
        "--top-k", type=int, default=config.RETRIEVAL_TOP_K,
        help=f"检索返回文档数 (默认: {config.RETRIEVAL_TOP_K})",
    )
    parser.add_argument(
        "--skip-api-check", action="store_true",
        help="跳过 API 连通性检查",
    )
    parser.add_argument(
        "--output-dir", type=str, default=None,
        help=f"中间结果保存目录 (默认: {config.OCR_OUTPUT_DIR})",
    )

    args = parser.parse_args()

    # Banner
    print_banner()

    # API connectivity check (informational only)
    if not args.skip_api_check:
        print(" 🔌 API 连通性检查:")
        check_apis()
        print()

    # Mode selection
    if args.load:
        # Reuse the existing vector store and skip OCR entirely.
        print(" 📂 加载已有向量库...")
        manager = VectorStoreManager(store_type=config.VECTOR_STORE_TYPE)
        count = manager.get_document_count()
        if count == 0:
            logger.error("向量库为空! 请先用 -f 指定文件进行入库")
            sys.exit(1)
        print(f" ✅ 已加载: {count} 个文档块")
    elif args.files:
        # Ingest the given files.
        output_dir = Path(args.output_dir) if args.output_dir else config.OCR_OUTPUT_DIR
        manager = run_ingest(
            args.files,
            chunk_size=args.chunk_size,
            chunk_overlap=args.chunk_overlap,
            clear=args.clear,
            output_dir=output_dir,
        )
    else:
        parser.print_help()
        print("\n ❌ 请指定 -f/--files 或 --load")
        sys.exit(1)

    # Initialise the RAG chain
    print("\n" + "─" * 55)
    print(" 🔗 阶段 2: 初始化 RAG 问答引擎")
    print("─" * 55)
    llm = create_llm()
    chain = RAGChain(
        vector_store_manager=manager,
        llm=llm,
        top_k=args.top_k,
    )
    print(f" ✅ RAG 引擎就绪 (LLM={config.LLM_MODEL_NAME})")

    # Question answering
    if args.question:
        run_qa(chain, args.question, show_sources=args.show_sources)
    else:
        run_repl(chain)


if __name__ == "__main__":
    main()
|
static/index.html
ADDED
|
@@ -0,0 +1,637 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="zh-CN">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>OCR RAG — 智能问答系统</title>
|
| 7 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 8 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 9 |
+
<link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600&family=Outfit:wght@300;400;500;600;700&display=swap" rel="stylesheet">
|
| 10 |
+
<style>
|
| 11 |
+
:root {
|
| 12 |
+
--bg-root: #07090f;
|
| 13 |
+
--bg-surface: #0c0f17;
|
| 14 |
+
--bg-elevated: #111620;
|
| 15 |
+
--bg-overlay: #181d29;
|
| 16 |
+
--border-default: #1e2533;
|
| 17 |
+
--border-active: #2a3347;
|
| 18 |
+
--text-primary: #e4e7ee;
|
| 19 |
+
--text-secondary: #8b92a3;
|
| 20 |
+
--text-muted: #545b6d;
|
| 21 |
+
--accent-amber: #e8a840;
|
| 22 |
+
--accent-amber-dim: rgba(232,168,64,0.12);
|
| 23 |
+
--accent-amber-glow: rgba(232,168,64,0.25);
|
| 24 |
+
--accent-steel: #7eb8da;
|
| 25 |
+
--accent-steel-dim: rgba(126,184,218,0.1);
|
| 26 |
+
--accent-green: #4db88d;
|
| 27 |
+
--accent-red: #e0556a;
|
| 28 |
+
--radius-sm: 6px;
|
| 29 |
+
--radius-md: 10px;
|
| 30 |
+
--radius-lg: 14px;
|
| 31 |
+
--font-body: 'Outfit', system-ui, -apple-system, sans-serif;
|
| 32 |
+
--font-mono: 'JetBrains Mono', 'SF Mono', monospace;
|
| 33 |
+
--transition-smooth: 0.25s cubic-bezier(0.22,0.61,0.36,1);
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
*,*::before,*::after{box-sizing:border-box;margin:0;padding:0}
|
| 37 |
+
html,body{height:100%;background:var(--bg-root);color:var(--text-primary);font-family:var(--font-body);font-weight:400;line-height:1.6;overflow:hidden}
|
| 38 |
+
body::before{content:'';position:fixed;inset:0;background:radial-gradient(ellipse 60% 50% at 20% 50%, rgba(126,184,218,0.04) 0%,transparent 70%),radial-gradient(ellipse 50% 60% at 85% 40%, rgba(232,168,64,0.03) 0%,transparent 70%);pointer-events:none;z-index:0}
|
| 39 |
+
body::after{content:'';position:fixed;inset:0;background-image:radial-gradient(circle, rgba(255,255,255,0.025) 1px, transparent 1px);background-size:28px 28px;pointer-events:none;z-index:0}
|
| 40 |
+
#app{position:relative;z-index:1;display:flex;height:100vh;width:100vw}
|
| 41 |
+
|
| 42 |
+
/* ── Sidebar ── */
|
| 43 |
+
#sidebar{width:340px;min-width:340px;background:var(--bg-surface);border-right:1px solid var(--border-default);display:flex;flex-direction:column;overflow-y:auto;overflow-x:hidden;z-index:2}
|
| 44 |
+
.sidebar-brand{padding:24px 24px 20px;border-bottom:1px solid var(--border-default)}
|
| 45 |
+
.sidebar-brand .logo{display:flex;align-items:center;gap:10px;text-decoration:none;color:inherit}
|
| 46 |
+
.sidebar-brand .logo-icon{width:34px;height:34px;background:linear-gradient(135deg,var(--accent-amber),#d4952a);border-radius:var(--radius-sm);display:flex;align-items:center;justify-content:center;font-size:18px;color:#0a0d14;font-weight:700}
|
| 47 |
+
.sidebar-brand h1{font-family:var(--font-mono);font-size:15px;font-weight:600;letter-spacing:-0.02em;color:var(--text-primary);line-height:1.2}
|
| 48 |
+
.sidebar-brand .subtitle{font-size:11px;color:var(--text-muted);font-family:var(--font-mono);letter-spacing:0.04em;text-transform:uppercase}
|
| 49 |
+
.sidebar-section{padding:20px 24px;border-bottom:1px solid var(--border-default)}
|
| 50 |
+
.sidebar-section-header{display:flex;align-items:center;gap:8px;margin-bottom:14px}
|
| 51 |
+
.sidebar-section-header .dot{width:7px;height:7px;border-radius:50%;background:var(--accent-amber);box-shadow:0 0 6px var(--accent-amber-glow)}
|
| 52 |
+
.sidebar-section-header span{font-family:var(--font-mono);font-size:11px;font-weight:500;letter-spacing:0.06em;text-transform:uppercase;color:var(--text-secondary)}
|
| 53 |
+
.sidebar-section-header .count-badge{font-family:var(--font-mono);font-size:10px;font-weight:600;background:var(--bg-elevated);border:1px solid var(--border-active);padding:2px 8px;border-radius:100px;color:var(--text-secondary);margin-left:auto}
|
| 54 |
+
|
| 55 |
+
.upload-zone{border:2px dashed var(--border-active);border-radius:var(--radius-md);padding:24px 20px;text-align:center;cursor:pointer;transition:all var(--transition-smooth);background:var(--bg-elevated)}
|
| 56 |
+
.upload-zone:hover,.upload-zone.drag-over{border-color:var(--accent-amber);background:var(--accent-amber-dim)}
|
| 57 |
+
.upload-zone .upload-icon{font-size:28px;margin-bottom:8px;opacity:0.7}
|
| 58 |
+
.upload-zone .upload-text{font-size:13px;color:var(--text-secondary);font-weight:500}
|
| 59 |
+
.upload-zone .upload-hint{font-size:11px;color:var(--text-muted);margin-top:4px;font-family:var(--font-mono)}
|
| 60 |
+
|
| 61 |
+
.file-queue{margin-top:10px;max-height:160px;overflow-y:auto}
|
| 62 |
+
.file-queue-item{display:flex;align-items:center;gap:8px;padding:8px 10px;border-radius:var(--radius-sm);font-size:12px;margin-top:4px;background:var(--bg-elevated);border:1px solid var(--border-default);transition:all var(--transition-smooth)}
|
| 63 |
+
.file-queue-item:hover{border-color:var(--border-active)}
|
| 64 |
+
.file-queue-item .fq-icon{font-size:15px;flex-shrink:0}
|
| 65 |
+
.file-queue-item .fq-info{flex:1;min-width:0}
|
| 66 |
+
.file-queue-item .fq-name{font-weight:500;color:var(--text-primary);white-space:nowrap;overflow:hidden;text-overflow:ellipsis;font-size:12px}
|
| 67 |
+
.file-queue-item .fq-meta{font-family:var(--font-mono);font-size:10px;color:var(--text-muted)}
|
| 68 |
+
.file-queue-item .fq-remove{width:22px;height:22px;flex-shrink:0;border-radius:50%;border:1px solid var(--border-default);background:transparent;color:var(--text-muted);cursor:pointer;display:flex;align-items:center;justify-content:center;font-size:14px;line-height:1;transition:all var(--transition-smooth)}
|
| 69 |
+
.file-queue-item .fq-remove:hover{border-color:var(--accent-red);color:var(--accent-red);background:rgba(224,85,106,0.08)}
|
| 70 |
+
|
| 71 |
+
.btn{display:inline-flex;align-items:center;justify-content:center;gap:6px;border:none;border-radius:var(--radius-sm);font-family:var(--font-body);font-size:13px;font-weight:500;cursor:pointer;transition:all var(--transition-smooth);padding:10px 18px;white-space:nowrap}
|
| 72 |
+
.btn-primary{width:100%;background:linear-gradient(135deg,var(--accent-amber),#d4952a);color:#0a0d14;font-weight:600;font-size:14px;padding:12px 24px;letter-spacing:0.02em}
|
| 73 |
+
.btn-primary:hover{filter:brightness(1.1);transform:translateY(-1px)}
|
| 74 |
+
.btn-primary:active{transform:translateY(0)}
|
| 75 |
+
.btn-primary:disabled{opacity:0.4;cursor:not-allowed;filter:none;transform:none}
|
| 76 |
+
.btn-sm{padding:6px 14px;font-size:12px;border-radius:var(--radius-sm);width:auto}
|
| 77 |
+
|
| 78 |
+
.status-row{display:flex;align-items:center;gap:8px;padding:10px 14px;border-radius:var(--radius-sm);background:var(--bg-elevated);margin-top:8px;font-size:12px}
|
| 79 |
+
.status-dot{width:8px;height:8px;border-radius:50%;flex-shrink:0}
|
| 80 |
+
.status-dot.idle{background:var(--text-muted)}
|
| 81 |
+
.status-dot.processing{background:var(--accent-amber);animation:pulse 1.2s ease-in-out infinite}
|
| 82 |
+
.status-dot.ready{background:var(--accent-green);box-shadow:0 0 6px rgba(77,184,141,0.4)}
|
| 83 |
+
.status-dot.error{background:var(--accent-red)}
|
| 84 |
+
@keyframes pulse{0%,100%{opacity:1;box-shadow:0 0 4px var(--accent-amber-glow)}50%{opacity:0.4;box-shadow:0 0 12px var(--accent-amber-glow)}}
|
| 85 |
+
|
| 86 |
+
.process-log{margin-top:10px;background:var(--bg-root);border-radius:var(--radius-sm);padding:12px;font-family:var(--font-mono);font-size:11px;color:var(--text-muted);max-height:140px;overflow-y:auto;line-height:1.7;display:none}
|
| 87 |
+
.process-log.visible{display:block}
|
| 88 |
+
.process-log .log-entry{opacity:0;animation:logReveal 0.3s ease forwards}
|
| 89 |
+
.process-log .log-entry:nth-child(1){animation-delay:0.05s}
|
| 90 |
+
.process-log .log-entry:nth-child(2){animation-delay:0.15s}
|
| 91 |
+
.process-log .log-entry:nth-child(3){animation-delay:0.25s}
|
| 92 |
+
.process-log .log-entry:nth-child(4){animation-delay:0.35s}
|
| 93 |
+
.process-log .log-entry:nth-child(5){animation-delay:0.45s}
|
| 94 |
+
@keyframes logReveal{from{opacity:0;transform:translateX(-8px)}to{opacity:1;transform:translateX(0)}}
|
| 95 |
+
|
| 96 |
+
/* ── Sidebar file list ── */
|
| 97 |
+
.sb-pf-item{display:flex;align-items:center;gap:6px;padding:7px 10px;border-radius:var(--radius-sm);margin-top:3px;background:var(--bg-elevated);border:1px solid transparent;cursor:pointer;transition:all var(--transition-smooth)}
|
| 98 |
+
.sb-pf-item:hover{border-color:var(--border-active)}
|
| 99 |
+
.sb-pf-item.selected{border-color:var(--accent-amber);background:var(--accent-amber-dim)}
|
| 100 |
+
.sb-pf-item .sb-icon{font-size:13px;flex-shrink:0;opacity:0.7}
|
| 101 |
+
.sb-pf-item .sb-name{font-size:11px;font-weight:500;color:var(--text-primary);white-space:nowrap;overflow:hidden;text-overflow:ellipsis;flex:1}
|
| 102 |
+
.sb-pf-item .sb-meta{font-family:var(--font-mono);font-size:9px;color:var(--text-muted);white-space:nowrap}
|
| 103 |
+
.sb-pf-item .sb-del{width:18px;height:18px;border-radius:50%;border:none;background:transparent;color:var(--text-muted);cursor:pointer;font-size:11px;display:none;align-items:center;justify-content:center;flex-shrink:0;transition:all var(--transition-smooth)}
|
| 104 |
+
.sb-pf-item:hover .sb-del{display:flex}
|
| 105 |
+
.sb-pf-item .sb-del:hover{background:rgba(224,85,106,0.12);color:var(--accent-red)}
|
| 106 |
+
.sb-pf-empty{font-size:11px;color:var(--text-muted);text-align:center;padding:16px 0}
|
| 107 |
+
|
| 108 |
+
/* ── Main ── */
|
| 109 |
+
#main{flex:1;display:flex;flex-direction:column;min-width:0;background:var(--bg-root)}
|
| 110 |
+
.tab-nav{display:flex;gap:0;border-bottom:1px solid var(--border-default);padding:0 20px;background:var(--bg-surface)}
|
| 111 |
+
.tab-btn{padding:14px 22px;background:none;border:none;color:var(--text-muted);font-family:var(--font-mono);font-size:12px;font-weight:500;cursor:pointer;letter-spacing:0.04em;position:relative;transition:color var(--transition-smooth)}
|
| 112 |
+
.tab-btn:hover{color:var(--text-secondary)}
|
| 113 |
+
.tab-btn.active{color:var(--accent-amber)}
|
| 114 |
+
.tab-btn.active::after{content:'';position:absolute;bottom:-1px;left:0;right:0;height:2px;background:var(--accent-amber);box-shadow:0 0 8px var(--accent-amber-glow)}
|
| 115 |
+
.tab-btn .tab-badge{font-size:10px;background:var(--accent-amber-dim);color:var(--accent-amber);padding:1px 7px;border-radius:100px;margin-left:6px}
|
| 116 |
+
.tab-panel{display:none;flex:1;overflow:hidden}
|
| 117 |
+
.tab-panel.active{display:flex;flex-direction:column}
|
| 118 |
+
|
| 119 |
+
/* ── Chat ── */
|
| 120 |
+
#chat-panel{display:none;flex:1;flex-direction:column;overflow:hidden}
|
| 121 |
+
#chat-panel.active{display:flex}
|
| 122 |
+
.chat-messages{flex:1;overflow-y:auto;padding:24px 28px;display:flex;flex-direction:column;gap:18px}
|
| 123 |
+
.chat-empty{flex:1;display:flex;flex-direction:column;align-items:center;justify-content:center;color:var(--text-muted);text-align:center;gap:12px}
|
| 124 |
+
.chat-empty .empty-icon{font-size:48px;opacity:0.3}
|
| 125 |
+
.chat-empty .empty-title{font-size:18px;font-weight:500;color:var(--text-secondary)}
|
| 126 |
+
.chat-empty .empty-desc{font-size:13px;max-width:400px;line-height:1.6}
|
| 127 |
+
.quick-prompts{display:flex;flex-wrap:wrap;gap:8px;justify-content:center;margin-top:8px}
|
| 128 |
+
.quick-prompt{padding:6px 14px;border-radius:100px;font-size:12px;font-weight:500;cursor:pointer;border:1px solid var(--border-active);background:var(--bg-elevated);color:var(--text-secondary);transition:all var(--transition-smooth);white-space:nowrap}
|
| 129 |
+
.quick-prompt:hover{border-color:var(--accent-amber);color:var(--accent-amber);background:var(--accent-amber-dim)}
|
| 130 |
+
.message{display:flex;gap:12px;animation:msgIn 0.35s cubic-bezier(0.22,0.61,0.36,1)}
|
| 131 |
+
@keyframes msgIn{from{opacity:0;transform:translateY(12px)}to{opacity:1;transform:translateY(0)}}
|
| 132 |
+
.message .msg-avatar{width:32px;height:32px;border-radius:var(--radius-sm);display:flex;align-items:center;justify-content:center;font-size:15px;flex-shrink:0;font-weight:600}
|
| 133 |
+
.message.user .msg-avatar{background:var(--accent-steel-dim);color:var(--accent-steel);font-family:var(--font-mono);font-size:13px}
|
| 134 |
+
.message.assistant .msg-avatar{background:var(--accent-amber-dim);color:var(--accent-amber)}
|
| 135 |
+
.message .msg-bubble{max-width:75%;padding:12px 16px;border-radius:var(--radius-md);font-size:14px;line-height:1.65}
|
| 136 |
+
.message.user .msg-bubble{background:var(--bg-overlay);color:var(--text-primary);border:1px solid var(--border-default)}
|
| 137 |
+
.message.assistant .msg-bubble{background:var(--bg-elevated);color:var(--text-primary);border:1px solid var(--border-active)}
|
| 138 |
+
.chat-input-area{padding:16px 24px 20px;border-top:1px solid var(--border-default);background:var(--bg-surface)}
|
| 139 |
+
.chat-input-row{display:flex;gap:10px;align-items:flex-end}
|
| 140 |
+
.chat-input-row textarea{flex:1;background:var(--bg-root);border:1px solid var(--border-active);border-radius:var(--radius-md);color:var(--text-primary);font-family:var(--font-body);font-size:14px;padding:12px 16px;resize:none;outline:none;min-height:46px;max-height:120px;line-height:1.5;transition:border-color var(--transition-smooth)}
|
| 141 |
+
.chat-input-row textarea:focus{border-color:var(--accent-amber);box-shadow:0 0 0 3px var(--accent-amber-dim)}
|
| 142 |
+
.chat-input-row textarea::placeholder{color:var(--text-muted)}
|
| 143 |
+
.chat-input-row .btn-send{width:46px;height:46px;border-radius:var(--radius-md);background:var(--accent-amber);border:none;color:#0a0d14;font-size:18px;cursor:pointer;transition:all var(--transition-smooth);display:flex;align-items:center;justify-content:center;flex-shrink:0}
|
| 144 |
+
.chat-input-row .btn-send:hover{filter:brightness(1.1)}
|
| 145 |
+
.chat-input-row .btn-send:disabled{opacity:0.3;cursor:not-allowed;filter:none}
|
| 146 |
+
|
| 147 |
+
/* ── Documents tab (merged Preview + Documents) ── */
|
| 148 |
+
#preview-panel{display:flex;flex-direction:column;overflow:hidden;flex:1}
|
| 149 |
+
|
| 150 |
+
/* File cards row */
|
| 151 |
+
.doc-cards-wrap{padding:16px 24px;border-bottom:1px solid var(--border-default);overflow-x:auto;flex-shrink:0}
|
| 152 |
+
.doc-cards-wrap h3{font-family:var(--font-mono);font-size:11px;font-weight:500;letter-spacing:0.06em;color:var(--text-muted);text-transform:uppercase;margin-bottom:12px;display:flex;align-items:center;gap:8px}
|
| 153 |
+
.doc-cards-wrap h3::before{content:'';width:7px;height:7px;border-radius:50%;background:var(--accent-amber);box-shadow:0 0 6px var(--accent-amber-glow)}
|
| 154 |
+
.doc-cards{display:flex;gap:12px;padding-bottom:4px}
|
| 155 |
+
.doc-card{flex:0 0 auto;width:220px;background:var(--bg-elevated);border:2px solid var(--border-default);border-radius:var(--radius-md);padding:16px;cursor:pointer;transition:all var(--transition-smooth);position:relative}
|
| 156 |
+
.doc-card:hover{border-color:var(--border-active);background:var(--bg-overlay)}
|
| 157 |
+
.doc-card.selected{border-color:var(--accent-amber);box-shadow:0 0 12px var(--accent-amber-dim);background:var(--bg-overlay)}
|
| 158 |
+
.doc-card .dc-icon{width:38px;height:38px;border-radius:var(--radius-sm);display:flex;align-items:center;justify-content:center;font-family:var(--font-mono);font-size:12px;font-weight:700;margin-bottom:10px}
|
| 159 |
+
.doc-card .dc-icon.pdf{background:rgba(224,85,106,0.12);color:var(--accent-red)}
|
| 160 |
+
.doc-card .dc-icon.img{background:var(--accent-steel-dim);color:var(--accent-steel)}
|
| 161 |
+
.doc-card .dc-name{font-weight:600;font-size:13px;color:var(--text-primary);white-space:nowrap;overflow:hidden;text-overflow:ellipsis;margin-bottom:8px;line-height:1.3}
|
| 162 |
+
.doc-card .dc-stats{display:flex;gap:10px;font-family:var(--font-mono);font-size:10px;color:var(--text-muted)}
|
| 163 |
+
.doc-card .dc-stats span{display:flex;align-items:center;gap:2px}
|
| 164 |
+
.doc-card .dc-stats .dc-val{color:var(--text-secondary)}
|
| 165 |
+
.doc-card .dc-delete{position:absolute;top:8px;right:8px;width:22px;height:22px;border-radius:50%;border:1px solid transparent;background:transparent;color:var(--text-muted);cursor:pointer;display:flex;align-items:center;justify-content:center;font-size:12px;transition:all var(--transition-smooth);opacity:0}
|
| 166 |
+
.doc-card:hover .dc-delete{opacity:1;border-color:var(--border-default)}
|
| 167 |
+
.doc-card .dc-delete:hover{border-color:var(--accent-red);color:var(--accent-red);background:rgba(224,85,106,0.08)}
|
| 168 |
+
|
| 169 |
+
/* Preview pane */
|
| 170 |
+
.doc-preview-wrap{flex:1;display:flex;flex-direction:column;overflow:hidden}
|
| 171 |
+
.doc-preview-header{display:flex;align-items:center;gap:12px;padding:12px 24px;background:var(--bg-surface);border-bottom:1px solid var(--border-default);flex-shrink:0}
|
| 172 |
+
.doc-preview-header .dph-title{font-weight:600;font-size:14px;color:var(--text-primary)}
|
| 173 |
+
.doc-preview-header .dph-meta{font-family:var(--font-mono);font-size:11px;color:var(--text-muted);margin-left:auto}
|
| 174 |
+
.doc-preview-body{flex:1;overflow-y:auto;padding:24px 28px;font-family:var(--font-mono);font-size:13px;line-height:1.8;color:var(--text-secondary);white-space:pre-wrap;word-break:break-word}
|
| 175 |
+
.doc-preview-empty{flex:1;display:flex;align-items:center;justify-content:center;text-align:center;color:var(--text-muted)}
|
| 176 |
+
.doc-preview-empty .dpe-icon{font-size:48px;opacity:0.15;margin-bottom:10px}
|
| 177 |
+
.doc-preview-empty .dpe-text{font-size:14px}
|
| 178 |
+
.pv-page-header{color:var(--accent-amber);font-weight:600;font-size:12px;letter-spacing:0.04em;padding:8px 0;border-bottom:1px solid var(--border-default);margin:16px 0 12px}
|
| 179 |
+
.pv-page-header:first-child{margin-top:0}
|
| 180 |
+
|
| 181 |
+
.doc-empty-state{display:flex;flex-direction:column;align-items:center;justify-content:center;height:100%;text-align:center;color:var(--text-muted)}
|
| 182 |
+
.doc-empty-state .de-icon{font-size:56px;opacity:0.12;margin-bottom:12px}
|
| 183 |
+
.doc-empty-state .de-title{font-size:16px;color:var(--text-secondary);font-weight:500}
|
| 184 |
+
.doc-empty-state .de-desc{font-size:13px;margin-top:6px;max-width:360px}
|
| 185 |
+
|
| 186 |
+
/* ── Split preview ── */
|
| 187 |
+
.pv-split-wrap{flex:1;display:flex;flex-direction:column;overflow:hidden}
|
| 188 |
+
.pv-split-header{display:flex;align-items:center;gap:12px;padding:10px 24px;background:var(--bg-surface);border-bottom:1px solid var(--border-default);flex-shrink:0}
|
| 189 |
+
.pv-split-body{flex:1;display:flex;overflow:hidden}
|
| 190 |
+
.pv-left{flex:1;overflow:auto;background:var(--bg-root);display:flex;align-items:center;justify-content:center;min-width:0}
|
| 191 |
+
.pv-left iframe{width:100%;height:100%;border:none;background:#fff}
|
| 192 |
+
.pv-left img{max-width:100%;max-height:100%;object-fit:contain}
|
| 193 |
+
.pv-left-placeholder{color:var(--text-muted);font-size:13px;text-align:center}
|
| 194 |
+
.pv-divider{width:4px;background:var(--border-default);flex-shrink:0;cursor:col-resize;transition:background var(--transition-smooth)}
|
| 195 |
+
.pv-divider:hover{background:var(--accent-amber)}
|
| 196 |
+
.pv-right{flex:1;overflow-y:auto;padding:20px 24px;font-family:var(--font-mono);font-size:13px;line-height:1.8;color:var(--text-secondary);white-space:pre-wrap;word-break:break-word;min-width:0}
|
| 197 |
+
.pv-right-placeholder{text-align:center;color:var(--text-muted);font-size:13px;padding-top:60px}
|
| 198 |
+
|
| 199 |
+
/* ── Status ── */
|
| 200 |
+
#status-panel{padding:24px 28px;overflow-y:auto;flex:1}
|
| 201 |
+
.status-card{background:var(--bg-elevated);border:1px solid var(--border-default);border-radius:var(--radius-md);padding:20px;margin-bottom:16px}
|
| 202 |
+
.status-card h3{font-family:var(--font-mono);font-size:12px;font-weight:500;letter-spacing:0.06em;color:var(--text-muted);text-transform:uppercase;margin-bottom:14px;display:flex;align-items:center;gap:8px}
|
| 203 |
+
.status-card h3::before{content:'';width:8px;height:8px;border-radius:2px;background:var(--accent-steel)}
|
| 204 |
+
.status-card h3 .sc-btn{margin-left:auto;cursor:pointer}
|
| 205 |
+
.status-card h3 .sc-btn::before{content:none}
|
| 206 |
+
.model-stack{display:grid;grid-template-columns:1fr 1fr;gap:10px}
|
| 207 |
+
.model-item{background:var(--bg-surface);border-radius:var(--radius-sm);padding:12px 14px;border:1px solid var(--border-default)}
|
| 208 |
+
.model-item .model-label{font-size:10px;font-family:var(--font-mono);color:var(--text-muted);text-transform:uppercase;letter-spacing:0.04em;margin-bottom:4px}
|
| 209 |
+
.model-item .model-value{font-size:13px;font-weight:600;color:var(--text-primary)}
|
| 210 |
+
.model-item .model-sub{font-family:var(--font-mono);font-size:10px;color:var(--text-muted);margin-top:2px;word-break:break-all}
|
| 211 |
+
.param-row{display:flex;justify-content:space-between;align-items:center;padding:8px 0;border-bottom:1px solid rgba(255,255,255,0.03);font-size:13px}
|
| 212 |
+
.param-row:last-child{border-bottom:none}
|
| 213 |
+
.param-row .param-label{color:var(--text-secondary)}
|
| 214 |
+
.param-row .param-value{font-family:var(--font-mono);font-size:12px;color:var(--accent-steel);font-weight:500}
|
| 215 |
+
.param-edit{display:flex;align-items:center;gap:8px}
|
| 216 |
+
.param-edit input[type="range"]{-webkit-appearance:none;width:120px;height:4px;border-radius:2px;background:var(--border-active);outline:none}
|
| 217 |
+
.param-edit input[type="range"]::-webkit-slider-thumb{-webkit-appearance:none;width:14px;height:14px;border-radius:50%;background:var(--accent-amber);cursor:pointer;border:2px solid var(--bg-root)}
|
| 218 |
+
.param-edit .pe-val{font-family:var(--font-mono);font-size:12px;color:var(--accent-amber);min-width:45px;text-align:right}
|
| 219 |
+
.param-edit input[type="number"]{width:70px;padding:6px 8px;background:var(--bg-root);border:1px solid var(--border-default);border-radius:var(--radius-sm);color:var(--text-primary);font-family:var(--font-mono);font-size:12px;outline:none;text-align:center}
|
| 220 |
+
.param-edit input[type="number"]:focus{border-color:var(--accent-amber)}
|
| 221 |
+
.config-section{margin-bottom:16px;padding-bottom:12px;border-bottom:1px solid var(--border-default)}
|
| 222 |
+
.config-section:last-child{border-bottom:none;margin-bottom:0}
|
| 223 |
+
.config-section h4{font-size:13px;font-weight:600;color:var(--text-secondary);margin-bottom:8px}
|
| 224 |
+
.form-group{margin-bottom:8px}
|
| 225 |
+
.form-group label{display:block;font-size:11px;font-weight:500;color:var(--text-muted);margin-bottom:4px;text-transform:uppercase;letter-spacing:0.05em}
|
| 226 |
+
.form-group input,.form-group select{width:100%;padding:8px 10px;background:var(--bg-root);border:1px solid var(--border-default);border-radius:var(--radius-sm);color:var(--text-primary);font-family:var(--font-mono);font-size:12px;outline:none;transition:var(--transition-smooth)}
|
| 227 |
+
.form-group input:focus,.form-group select:focus{border-color:var(--accent-amber);box-shadow:0 0 0 2px var(--accent-amber-dim)}
|
| 228 |
+
.form-group .input-row{display:flex;gap:6px}
|
| 229 |
+
.form-group .input-row input{flex:1}
|
| 230 |
+
.form-group .toggle-vis{width:34px;flex-shrink:0;background:var(--bg-elevated);border:1px solid var(--border-default);border-radius:var(--radius-sm);color:var(--text-muted);cursor:pointer;font-family:var(--font-mono);font-size:11px;display:flex;align-items:center;justify-content:center;transition:var(--transition-smooth)}
|
| 231 |
+
.form-group .toggle-vis:hover{border-color:var(--border-active);color:var(--text-secondary)}
|
| 232 |
+
.config-readonly{font-family:var(--font-mono);font-size:11px;color:var(--text-muted);padding:4px 0}
|
| 233 |
+
.sources-toggle{padding:8px 24px;font-size:11px;font-family:var(--font-mono);color:var(--text-muted);cursor:pointer;user-select:none;border-top:1px solid var(--border-default);background:var(--bg-surface);display:flex;align-items:center;gap:6px;transition:color var(--transition-smooth)}
|
| 234 |
+
.sources-toggle:hover{color:var(--text-secondary)}
|
| 235 |
+
.sources-content{background:var(--bg-elevated);border-top:1px solid var(--border-default);max-height:200px;overflow-y:auto;display:none;padding:12px 24px}
|
| 236 |
+
.sources-content.open{display:block}
|
| 237 |
+
.source-item{padding:8px 12px;border-left:2px solid var(--accent-steel);margin-bottom:8px;font-size:12px;background:var(--bg-surface);border-radius:0 var(--radius-sm) var(--radius-sm) 0}
|
| 238 |
+
.source-item .src-header{display:flex;gap:12px;font-family:var(--font-mono);font-size:10px;color:var(--accent-amber);margin-bottom:4px}
|
| 239 |
+
.source-item .src-excerpt{color:var(--text-secondary);font-size:12px;line-height:1.5}
|
| 240 |
+
.typing-indicator{display:flex;gap:5px;padding:4px 0}
|
| 241 |
+
.typing-indicator span{width:6px;height:6px;border-radius:50%;background:var(--text-muted);animation:typingBounce 1.2s ease-in-out infinite}
|
| 242 |
+
.typing-indicator span:nth-child(2){animation-delay:0.15s}
|
| 243 |
+
.typing-indicator span:nth-child(3){animation-delay:0.3s}
|
| 244 |
+
@keyframes typingBounce{0%,60%,100%{transform:translateY(0);opacity:0.4}30%{transform:translateY(-6px);opacity:1}}
|
| 245 |
+
::-webkit-scrollbar{width:5px}
|
| 246 |
+
::-webkit-scrollbar-track{background:transparent}
|
| 247 |
+
::-webkit-scrollbar-thumb{background:var(--border-active);border-radius:3px}
|
| 248 |
+
::-webkit-scrollbar-thumb:hover{background:var(--text-muted)}
|
| 249 |
+
.toast{position:fixed;bottom:24px;right:24px;background:var(--bg-overlay);border:1px solid var(--border-active);border-radius:var(--radius-md);padding:14px 20px;font-size:13px;z-index:100;opacity:0;transform:translateY(12px);transition:all 0.3s ease;pointer-events:none;max-width:360px}
|
| 250 |
+
.toast.show{opacity:1;transform:translateY(0)}
|
| 251 |
+
.toast.error{border-color:var(--accent-red)}
|
| 252 |
+
.toast.success{border-color:var(--accent-green)}
|
| 253 |
+
.progress-bar-wrap{height:3px;background:var(--border-default);border-radius:2px;margin-top:10px;overflow:hidden;display:none}
|
| 254 |
+
.progress-bar-wrap.active{display:block}
|
| 255 |
+
.progress-bar-fill{height:100%;background:linear-gradient(90deg,var(--accent-amber),#d4952a);border-radius:2px;width:0%;transition:width 0.3s ease}
|
| 256 |
+
@media(max-width:860px){
|
| 257 |
+
#app{flex-direction:column}
|
| 258 |
+
#sidebar{width:100%;min-width:100%;max-height:40vh;border-right:none;border-bottom:1px solid var(--border-default)}
|
| 259 |
+
.doc-card{width:180px}
|
| 260 |
+
.model-stack{grid-template-columns:1fr}
|
| 261 |
+
}
|
| 262 |
+
</style>
|
| 263 |
+
</head>
|
| 264 |
+
<body>
|
| 265 |
+
|
| 266 |
+
<div id="app">
|
| 267 |
+
<!-- ═══ SIDEBAR ═══ -->
|
| 268 |
+
<aside id="sidebar">
|
| 269 |
+
<div class="sidebar-brand">
|
| 270 |
+
<a class="logo" href="/"><div class="logo-icon">◈</div><div><h1>OCR RAG</h1><div class="subtitle">Intelligent Q&A System</div></div></a>
|
| 271 |
+
</div>
|
| 272 |
+
<div class="sidebar-section">
|
| 273 |
+
<div class="sidebar-section-header"><div class="dot"></div><span>Document Upload</span><span class="count-badge" id="queueCount">0</span></div>
|
| 274 |
+
<div class="upload-zone" id="uploadZone">
|
| 275 |
+
<div class="upload-icon">↓</div>
|
| 276 |
+
<div class="upload-text">Drop files here or click to browse</div>
|
| 277 |
+
<div class="upload-hint">PDF · PNG · JPG · BMP · TIF</div>
|
| 278 |
+
</div>
|
| 279 |
+
<input type="file" id="fileInput" accept=".pdf,.png,.jpg,.jpeg,.bmp,.tif,.tiff" multiple hidden>
|
| 280 |
+
<div class="file-queue" id="fileQueue"></div>
|
| 281 |
+
<button class="btn btn-primary" id="processBtn" disabled>→ Process Documents</button>
|
| 282 |
+
<div class="progress-bar-wrap" id="progressWrap"><div class="progress-bar-fill" id="progressFill"></div></div>
|
| 283 |
+
<div class="status-row">
|
| 284 |
+
<div class="status-dot idle" id="statusDot"></div>
|
| 285 |
+
<span id="statusText">Ready — upload files to begin</span>
|
| 286 |
+
</div>
|
| 287 |
+
<div class="process-log" id="processLog"></div>
|
| 288 |
+
</div>
|
| 289 |
+
<div class="sidebar-section" style="flex:1;overflow-y:auto" id="sidebarFileSection">
|
| 290 |
+
<div class="sidebar-section-header"><div class="dot"></div><span>Processed Files</span><span class="count-badge" id="sidebarPfCount">0</span></div>
|
| 291 |
+
<div id="sidebarFileList"><div class="sb-pf-empty">No files processed yet</div></div>
|
| 292 |
+
</div>
|
| 293 |
+
</aside>
|
| 294 |
+
|
| 295 |
+
<!-- ═══ MAIN ═══ -->
|
| 296 |
+
<main id="main">
|
| 297 |
+
<nav class="tab-nav">
|
| 298 |
+
<button class="tab-btn active" data-tab="chat">Chat</button>
|
| 299 |
+
<button class="tab-btn" data-tab="preview">Preview <span class="tab-badge" id="docTabBadge">0</span></button>
|
| 300 |
+
<button class="tab-btn" data-tab="status">System Status</button>
|
| 301 |
+
</nav>
|
| 302 |
+
|
| 303 |
+
<!-- Chat -->
|
| 304 |
+
<div class="tab-panel active" id="tab-chat">
|
| 305 |
+
<div id="chat-panel" class="active">
|
| 306 |
+
<div class="chat-messages" id="chatMessages">
|
| 307 |
+
<div class="chat-empty" id="chatEmpty">
|
| 308 |
+
<div class="empty-icon">◈</div>
|
| 309 |
+
<div class="empty-title">Ask questions about your documents</div>
|
| 310 |
+
<div class="empty-desc">Upload and process documents first, then ask questions. The AI will search through the documents to find relevant answers.</div>
|
| 311 |
+
<div class="quick-prompts">
|
| 312 |
+
<span class="quick-prompt" data-question="请对这份文档进行详细摘要,列出各章节的主要内容">Summary</span>
|
| 313 |
+
<span class="quick-prompt" data-question="文档中提到了哪些关键数据和重要信息?请分点列出">Key Data</span>
|
| 314 |
+
<span class="quick-prompt" data-question="文档中的表格包含了什么内容?请整理说明">Tables</span>
|
| 315 |
+
<span class="quick-prompt" data-question="文档的核心观点和结论是什么?">Core Ideas</span>
|
| 316 |
+
</div>
|
| 317 |
+
</div>
|
| 318 |
+
</div>
|
| 319 |
+
<div class="sources-toggle" id="sourcesToggle" style="display:none">↓ Sources & References</div>
|
| 320 |
+
<div class="sources-content" id="sourcesContent"></div>
|
| 321 |
+
<div class="chat-input-area">
|
| 322 |
+
<div class="chat-input-row">
|
| 323 |
+
<textarea id="questionInput" placeholder="Ask a question about the documents..." rows="1"></textarea>
|
| 324 |
+
<button class="btn-send" id="sendBtn" disabled>↑</button>
|
| 325 |
+
</div>
|
| 326 |
+
</div>
|
| 327 |
+
</div>
|
| 328 |
+
</div>
|
| 329 |
+
|
| 330 |
+
<!-- Preview -->
|
| 331 |
+
<div class="tab-panel" id="tab-preview">
|
| 332 |
+
<div id="preview-panel">
|
| 333 |
+
<!-- Top: file card row -->
|
| 334 |
+
<div class="doc-cards-wrap" id="docCardsWrap">
|
| 335 |
+
<h3>Processed Files</h3>
|
| 336 |
+
<div class="doc-cards" id="docCards"></div>
|
| 337 |
+
</div>
|
| 338 |
+
<div class="doc-empty-state" id="docEmptyState">
|
| 339 |
+
<div class="de-icon">◈</div>
|
| 340 |
+
<div class="de-title">No documents processed yet</div>
|
| 341 |
+
<div class="de-desc">Upload and process files from the sidebar — they will appear here. Click on a file card to preview the original file and OCR result side by side.</div>
|
| 342 |
+
</div>
|
| 343 |
+
<!-- Split preview pane -->
|
| 344 |
+
<div class="pv-split-wrap" id="pvSplitWrap" style="display:none">
|
| 345 |
+
<div class="pv-split-header">
|
| 346 |
+
<span class="dph-title" id="dphTitle">—</span>
|
| 347 |
+
<span class="dph-meta" id="dphMeta"></span>
|
| 348 |
+
</div>
|
| 349 |
+
<div class="pv-split-body">
|
| 350 |
+
<div class="pv-left" id="pvOriginal">
|
| 351 |
+
<div class="pv-left-placeholder">Select a file to preview</div>
|
| 352 |
+
</div>
|
| 353 |
+
<div class="pv-divider"></div>
|
| 354 |
+
<div class="pv-right" id="pvOcrText">
|
| 355 |
+
<div class="pv-right-placeholder">OCR result will appear here</div>
|
| 356 |
+
</div>
|
| 357 |
+
</div>
|
| 358 |
+
</div>
|
| 359 |
+
</div>
|
| 360 |
+
</div>
|
| 361 |
+
|
| 362 |
+
<!-- Status -->
|
| 363 |
+
<div class="tab-panel" id="tab-status">
|
| 364 |
+
<div id="status-panel">
|
| 365 |
+
<div class="status-card"><h3>Model Stack</h3>
|
| 366 |
+
<div class="model-stack">
|
| 367 |
+
<div class="model-item"><div class="model-label">OCR Engine</div><div class="model-value" id="sOCRModel">—</div><div class="model-sub" id="sOCRBase">—</div></div>
|
| 368 |
+
<div class="model-item"><div class="model-label">Embedding</div><div class="model-value" id="sEmbedModel">—</div><div class="model-sub" id="sEmbedBase">—</div></div>
|
| 369 |
+
<div class="model-item"><div class="model-label">LLM</div><div class="model-value" id="sLLMModel">—</div><div class="model-sub" id="sLLMBase">—</div></div>
|
| 370 |
+
<div class="model-item"><div class="model-label">Vector DB</div><div class="model-value" id="sVectorDB">—</div></div>
|
| 371 |
+
</div>
|
| 372 |
+
</div>
|
| 373 |
+
<div class="status-card"><h3>Processing Parameters</h3>
|
| 374 |
+
<div class="param-row">
|
| 375 |
+
<span class="param-label">Chunk Size</span>
|
| 376 |
+
<div class="param-edit"><input type="range" id="sChunkSize" min="200" max="2000" value="800" step="50"><span class="pe-val" id="sChunkSizeVal">800</span><span style="font-size:10px;color:var(--text-muted)">chars</span></div>
|
| 377 |
+
</div>
|
| 378 |
+
<div class="param-row">
|
| 379 |
+
<span class="param-label">Overlap</span>
|
| 380 |
+
<div class="param-edit"><input type="range" id="sChunkOverlap" min="0" max="500" value="150" step="25"><span class="pe-val" id="sChunkOverlapVal">150</span><span style="font-size:10px;color:var(--text-muted)">chars</span></div>
|
| 381 |
+
</div>
|
| 382 |
+
<div class="param-row" style="margin-top:10px">
|
| 383 |
+
<span class="param-label">Retrieval Top-K</span>
|
| 384 |
+
<div class="param-edit"><input type="number" id="sRetrievalK" min="1" max="20" value="5"></div>
|
| 385 |
+
</div>
|
| 386 |
+
<button class="btn btn-primary btn-sm" onclick="saveProcessingParams()" style="margin-top:14px">Save Parameters</button>
|
| 387 |
+
<span id="paramsMsg" style="font-size:12px;margin-left:10px"></span>
|
| 388 |
+
</div>
|
| 389 |
+
<div class="status-card" id="configCard">
|
| 390 |
+
<h3><span>API Configuration</span><button class="btn btn-primary btn-sm sc-btn" id="editConfigBtn">Edit</button></h3>
|
| 391 |
+
<div id="configDisplay"><div id="configContent"></div></div>
|
| 392 |
+
<div id="configEdit" style="display:none">
|
| 393 |
+
<div class="config-section"><h4>OCR API</h4>
|
| 394 |
+
<div class="form-group"><label>Engine</label><select id="cfgOcrEngine"><option value="paddle">paddle (local)</option><option value="api">api (remote)</option></select></div>
|
| 395 |
+
<div class="form-group"><label>API Base URL</label><input id="cfgOcrBase" placeholder="http://127.0.0.1:8002/v1"></div>
|
| 396 |
+
<div class="form-group"><label>API Key</label><div class="input-row"><input id="cfgOcrKey" type="password" placeholder="not-needed"><button class="toggle-vis" onclick="togglePassword('cfgOcrKey',this)">👁</button></div></div>
|
| 397 |
+
<div class="form-group"><label>Model Name</label><input id="cfgOcrModel" placeholder="PaddleOCR-VL-1.5"></div>
|
| 398 |
+
</div>
|
| 399 |
+
<div class="config-section"><h4>Embedding API</h4>
|
| 400 |
+
<div class="form-group"><label>API Base URL</label><input id="cfgEmbedBase" placeholder="https://dashscope.aliyuncs.com/compatible-mode/v1"></div>
|
| 401 |
+
<div class="form-group"><label>API Key</label><div class="input-row"><input id="cfgEmbedKey" type="password" placeholder="sk-..."><button class="toggle-vis" onclick="togglePassword('cfgEmbedKey',this)">👁</button></div></div>
|
| 402 |
+
<div class="form-group"><label>Model Name</label><input id="cfgEmbedModel" placeholder="text-embedding-v4"></div>
|
| 403 |
+
</div>
|
| 404 |
+
<div class="config-section"><h4>LLM API</h4>
|
| 405 |
+
<div class="form-group"><label>API Base URL</label><input id="cfgLLMBase" placeholder="http://0.0.0.0:8013/v1"></div>
|
| 406 |
+
<div class="form-group"><label>API Key</label><div class="input-row"><input id="cfgLLMKey" type="password" placeholder="not-needed"><button class="toggle-vis" onclick="togglePassword('cfgLLMKey',this)">👁</button></div></div>
|
| 407 |
+
<div class="form-group"><label>Model Name</label><input id="cfgLLMModel" placeholder="Qwen/Qwen3-4B-Instruct-2507"></div>
|
| 408 |
+
</div>
|
| 409 |
+
<div style="display:flex;gap:8px;margin-top:12px"><button class="btn btn-primary btn-sm" onclick="saveConfig()">Save</button><button class="btn btn-sm" onclick="cancelConfigEdit()">Cancel</button></div>
|
| 410 |
+
<div id="configMsg" style="margin-top:8px;font-size:13px"></div>
|
| 411 |
+
</div>
|
| 412 |
+
</div>
|
| 413 |
+
<div class="status-card"><h3>Database</h3><div id="sDBStats">No documents indexed</div></div>
|
| 414 |
+
</div>
|
| 415 |
+
</div>
|
| 416 |
+
</main>
|
| 417 |
+
</div>
|
| 418 |
+
|
| 419 |
+
<div class="toast" id="toast"></div>
|
| 420 |
+
|
| 421 |
+
<script>
|
| 422 |
+
const $=s=>document.querySelector(s);
|
| 423 |
+
const $$=s=>document.querySelectorAll(s);
|
| 424 |
+
const state={files:[],ready:false,processing:false,fileList:[],selectedDoc:-1};
|
| 425 |
+
|
| 426 |
+
const D={
|
| 427 |
+
uploadZone:$('#uploadZone'),fileInput:$('#fileInput'),fileQueue:$('#fileQueue'),
|
| 428 |
+
queueCount:$('#queueCount'),processBtn:$('#processBtn'),
|
| 429 |
+
progressWrap:$('#progressWrap'),progressFill:$('#progressFill'),
|
| 430 |
+
statusDot:$('#statusDot'),statusText:$('#statusText'),processLog:$('#processLog'),
|
| 431 |
+
chatMessages:$('#chatMessages'),chatEmpty:$('#chatEmpty'),
|
| 432 |
+
questionInput:$('#questionInput'),sendBtn:$('#sendBtn'),
|
| 433 |
+
sToggle:$('#sourcesToggle'),sContent:$('#sourcesContent'),
|
| 434 |
+
docCards:$('#docCards'),docCardsWrap:$('#docCardsWrap'),docEmptyState:$('#docEmptyState'),
|
| 435 |
+
pvSplitWrap:$('#pvSplitWrap'),pvOriginal:$('#pvOriginal'),pvOcrText:$('#pvOcrText'),
|
| 436 |
+
dphTitle:$('#dphTitle'),dphMeta:$('#dphMeta'),docTabBadge:$('#docTabBadge'),
|
| 437 |
+
toast:$('#toast'),
|
| 438 |
+
};
|
| 439 |
+
|
| 440 |
+
// ─── Toast ──────────────────────────────────────────────
|
| 441 |
+
// Timer handle for the currently visible toast, so a new toast can cancel the pending hide.
let tt;
// Show message `m` in the toast element with variant class `t` (e.g. 'error' or 'success'),
// then hide it again after 3.5 seconds.
function showToast(m,t){
  clearTimeout(tt);
  const el=D.toast;
  el.textContent=m;
  el.className='toast '+t+' show';
  tt=setTimeout(()=>{el.classList.remove('show')},3500);
}
|
| 442 |
+
|
| 443 |
+
// ─── Log / Progress ─────────────────────────────────────
|
| 444 |
+
// Append one line ("> message") to the processing log, make the log visible,
// and keep it scrolled to the newest entry.
function addLog(m){
  const log=D.processLog;
  log.classList.add('visible');
  const entry=document.createElement('div');
  entry.className='log-entry';
  entry.textContent='> '+m;
  log.appendChild(entry);
  log.scrollTop=log.scrollHeight;
}
|
| 445 |
+
function clearLog(){D.processLog.innerHTML='';D.processLog.classList.remove('visible')}
|
| 446 |
+
function setProgress(p){D.progressFill.style.width=p+'%'}
|
| 447 |
+
function showProgress(s){D.progressWrap.classList.toggle('active',s)}
|
| 448 |
+
function setStatus(st,t){D.statusDot.className='status-dot '+st;D.statusText.textContent=t}
|
| 449 |
+
|
| 450 |
+
// ─── File queue ─────────────────────────────────────────
|
| 451 |
+
const VE=['.pdf','.png','.jpg','.jpeg','.bmp','.tif','.tiff'];
|
| 452 |
+
// Re-render the pending-upload queue from state.files and sync the Process button
// (label + enabled state), the queue counter badge and the status line.
function renderQueue(){
  const count=state.files.length;
  if(!count){
    D.fileQueue.innerHTML='';
    D.processBtn.disabled=true;
    D.processBtn.textContent='→ Process Documents';
    D.queueCount.textContent='0';
    setStatus('idle','Ready — upload files to begin');
    return;
  }
  D.fileQueue.innerHTML=state.files.map((f,i)=>{
    // File names are user-controlled and go into innerHTML: escape them for element
    // content, and also escape double quotes because the name is reused inside title="…".
    const name=esc(f.name).replace(/"/g,'&quot;');
    const icon=f.name.toLowerCase().endsWith('.pdf')?'📄':'🖼️';
    return `<div class="file-queue-item"><span class="fq-icon">${icon}</span><div class="fq-info"><div class="fq-name" title="${name}">${name}</div><div class="fq-meta">${(f.size/1024/1024).toFixed(1)} MB</div></div><button class="fq-remove" onclick="removeFile(${i})" title="Remove">×</button></div>`;
  }).join('');
  const plural=count>1?'s':'';
  D.processBtn.disabled=false;
  D.processBtn.textContent=`→ Process ${count} File${plural}`;
  D.queueCount.textContent=count;
  setStatus('idle',`${count} file${plural} queued`);
}
|
| 458 |
+
// Add the files in `a` (a FileList or array of File) to the upload queue.
// Files whose extension is not in VE are skipped with an error toast; files already
// queued (same name and size) are skipped silently. Re-renders the queue afterwards.
function addFiles(a){
  for(const f of a){
    // Take the extension from the last dot only. A name without any dot has no
    // extension and must be rejected (split('.').pop() would turn "pdf" into ".pdf").
    const dot=f.name.lastIndexOf('.');
    const ext=dot>=0?f.name.slice(dot).toLowerCase():'';
    if(!VE.includes(ext)){showToast('Skipped: '+f.name,'error');continue}
    if(state.files.some(x=>x.name===f.name&&x.size===f.size))continue;
    state.files.push(f);
  }
  renderQueue();
}
|
| 459 |
+
function removeFile(i){state.files.splice(i,1);renderQueue()}
|
| 460 |
+
|
| 461 |
+
// ─── Upload events ──────────────────────────────────────
|
| 462 |
+
D.uploadZone.addEventListener('click',()=>D.fileInput.click());
|
| 463 |
+
D.uploadZone.addEventListener('dragover',e=>{e.preventDefault();D.uploadZone.classList.add('drag-over')});
|
| 464 |
+
D.uploadZone.addEventListener('dragleave',()=>D.uploadZone.classList.remove('drag-over'));
|
| 465 |
+
D.uploadZone.addEventListener('drop',e=>{e.preventDefault();D.uploadZone.classList.remove('drag-over');if(e.dataTransfer.files.length)addFiles(e.dataTransfer.files)});
|
| 466 |
+
D.fileInput.addEventListener('change',()=>{if(D.fileInput.files.length){addFiles(D.fileInput.files);D.fileInput.value=''}});
|
| 467 |
+
|
| 468 |
+
// ─── Process ────────────────────────────────────────────
|
| 469 |
+
// Upload every queued file to /api/upload together with the current chunking
// parameters, show simulated progress while waiting, then log per-file results.
D.processBtn.addEventListener('click',async()=>{
  if(!state.files.length||state.processing)return;
  state.processing=true;D.processBtn.disabled=true;D.processBtn.textContent='Processing...';
  clearLog();showProgress(true);setProgress(5);setStatus('processing','Processing documents...');
  addLog(`Processing ${state.files.length} file(s)...`);
  const fd=new FormData();state.files.forEach(f=>fd.append('files',f));
  fd.append('chunk_size',$('#sChunkSize').value);fd.append('chunk_overlap',$('#sChunkOverlap').value);
  // No real progress events are available, so ease the bar towards (but never past) 92%.
  const sim=setInterval(()=>{const w=parseFloat(D.progressFill.style.width)||5;setProgress(Math.min(w+(100-w)*0.15,92))},400);
  try{
    const r=await fetch('/api/upload',{method:'POST',body:fd});
    setProgress(100);
    if(!r.ok){
      // The error body may not be JSON (e.g. a proxy error page); fall back to a
      // generic message instead of surfacing a JSON parse error.
      let detail='';
      try{const body=await r.json();if(typeof body.detail==='string')detail=body.detail}catch(_){}
      throw new Error(detail||'Upload failed');
    }
    const d=await r.json();
    const done=d.results||[];
    const failed=d.errors||[];
    done.forEach(x=>addLog(`✓ ${x.name}: ${x.pages}p · ${x.chunks}c`));
    failed.forEach(e=>addLog(`ERROR: ${e}`));
    // Empty the queue BEFORE setting the final status: renderQueue() resets the
    // status line to idle when the queue is empty and would overwrite it otherwise.
    state.files=[];renderQueue();
    refreshFileList();refreshStatus();
    if(done.length){
      state.ready=true;
      setStatus('ready',`${d.total} file(s) processed · Ready for Q&A`);
      showToast(d.total+' document(s) processed!','success');
      D.chatEmpty.style.display='none';D.sendBtn.disabled=false;D.questionInput.focus();
    }else{
      // Nothing was indexed by this upload: report it instead of enabling Q&A.
      setStatus('error','No files were processed');
      showToast('No files were processed','error');
    }
  }catch(e){setProgress(0);setStatus('error','Processing failed');addLog(`ERROR: ${e.message}`);showToast(e.message,'error')}
  finally{clearInterval(sim);showProgress(false);state.processing=false;D.processBtn.textContent='→ Process Documents';D.processBtn.disabled=state.files.length===0}
});
|
| 488 |
+
|
| 489 |
+
// ─── Escape HTML ────────────────────────────────────────
|
| 490 |
+
// Escape a value for insertion into HTML markup, in element content or inside a
// quoted attribute value. null/undefined become ''; other values are stringified.
// Quotes are escaped too, because callers interpolate file names into title="…"
// and alt="…" attributes, where escaping only & < > is not sufficient.
function esc(s){
  if(s==null)return'';
  return String(s)
    .replace(/&/g,'&amp;')   // must run first so later entities are not double-escaped
    .replace(/</g,'&lt;')
    .replace(/>/g,'&gt;')
    .replace(/"/g,'&quot;')
    .replace(/'/g,'&#39;');
}
|
| 491 |
+
|
| 492 |
+
// ─── Documents tab — load preview ───────────────────────
|
| 493 |
+
// Show the split preview for the file at position `idx` in state.fileList:
// the original file on the left (iframe for PDFs, <img> otherwise) and the OCR
// text fetched from /api/preview/<idx> on the right, split into per-page sections.
async function loadPreview(idx){
  state.selectedDoc=idx;
  const f=state.fileList[idx];if(!f)return;
  // Highlight card & sidebar item
  $$('.doc-card').forEach((c,i)=>c.classList.toggle('selected',i===idx));
  $$('.sb-pf-item').forEach((c,i)=>c.classList.toggle('selected',i===idx));
  // Show split view
  D.pvSplitWrap.style.display='flex';D.docEmptyState.style.display='none';
  D.dphTitle.textContent=f.name;D.dphMeta.textContent='';

  // Left: original file. The name goes into an alt="…" attribute, so escape it
  // (including double quotes) before building the markup.
  const isPdf=f.format==='.pdf';
  const safeName=esc(f.name).replace(/"/g,'&quot;');
  D.pvOriginal.innerHTML=isPdf
    ? `<iframe src="/api/file/${idx}"></iframe>`
    : `<img src="/api/file/${idx}" alt="${safeName}">`;

  // Right: loading
  D.pvOcrText.innerHTML='<div style="text-align:center;color:var(--text-muted);padding:60px">Loading OCR result...</div>';

  // If another file was selected while this request was in flight, its response
  // must not overwrite the pane that now belongs to the newer selection.
  const stale=()=>state.selectedDoc!==idx;

  // Fetch OCR
  try{
    const r=await fetch('/api/preview/'+idx);
    if(stale())return;
    if(!r.ok){D.pvOcrText.innerHTML=`<div style="text-align:center;color:var(--accent-red);padding:60px">OCR text not available (${r.status})</div>`;return}
    const d=await r.json();
    if(stale())return;
    if(d.success&&d.text){
      // Splitting on the page marker with a capture group yields
      // [preamble, pageNo, pageText, pageNo, pageText, ...].
      const parts=d.text.split(/--- 第 (\d+) 页 ---/g);let h='';
      for(let i=1;i<parts.length;i+=2)h+=`<div class="pv-page-header">Page ${parts[i]}</div><div>${esc(parts[i+1]||'')}</div>`;
      if(!h&&d.text)h=`<div>${esc(d.text)}</div>`;
      D.pvOcrText.innerHTML=h||'<div style="text-align:center;color:var(--text-muted);padding:60px">No text content</div>';
      // Text without page markers would otherwise be reported as "0 pages";
      // fall back to the page count from the file list, or 1.
      const pg=(parts.length>>1)||f.pages||1;
      D.dphMeta.textContent=`${pg} pages · ${d.text.length} chars`;
    }else{D.pvOcrText.innerHTML='<div style="text-align:center;color:var(--text-muted);padding:60px">OCR result is empty</div>'}
  }catch(e){
    if(stale())return;
    D.pvOcrText.innerHTML=`<div style="text-align:center;color:var(--accent-red);padding:60px">Failed to load: ${esc(e.message)}</div>`;
  }
}
|
| 527 |
+
|
| 528 |
+
// ─── Documents tab — file list ──────────────────────────
|
| 529 |
+
// Reload the processed-file list from /api/status into state.fileList and re-render
// both views of it: the sidebar list and the card row on the Preview tab.
// Also keeps the tab badge, empty state and split-pane visibility in sync.
async function refreshFileList(){
  try{
    const r=await fetch('/api/status');
    if(!r.ok)throw new Error('status request failed ('+r.status+')');
    const d=await r.json();state.fileList=d.files||[];
    const files=state.fileList;
    // File names come from uploaded files and are inserted via innerHTML, both as
    // element content and inside title="…": escape them, including double quotes.
    const safe=s=>esc(s).replace(/"/g,'&quot;');
    // Sidebar list
    const sfl=$('#sidebarFileList');
    const spc=$('#sidebarPfCount');
    if(files.length){
      sfl.innerHTML=files.map((f,i)=>{
        const name=safe(f.name);
        return `<div class="sb-pf-item${i===state.selectedDoc?' selected':''}" onclick="switchTab('preview');loadPreview(${i})"><span class="sb-icon">${f.format==='.pdf'?'📄':'🖼️'}</span><span class="sb-name" title="${name}">${name}</span><span class="sb-meta">${safe(f.pages)}p·${safe(f.chunks)}c</span><button class="sb-del" onclick="event.stopPropagation();deleteDoc(${i})" title="Remove">×</button></div>`;
      }).join('');
      spc.textContent=files.length;
    }else{
      sfl.innerHTML='<div class="sb-pf-empty">No files processed yet</div>';
      spc.textContent='0';
    }
    // Documents tab
    if(files.length){
      D.docCardsWrap.style.display='';D.docEmptyState.style.display='none';
      D.docCards.innerHTML=files.map((f,i)=>{
        const name=safe(f.name);
        const pdf=f.format==='.pdf';
        return `<div class="doc-card${i===state.selectedDoc?' selected':''}" onclick="loadPreview(${i})"><div class="dc-icon ${pdf?'pdf':'img'}">${pdf?'PDF':'IMG'}</div><div class="dc-name" title="${name}">${name}</div><div class="dc-stats"><span>📄 <span class="dc-val">${safe(f.pages)}</span>p</span><span>🧩 <span class="dc-val">${safe(f.chunks)}</span>c</span><span>💾 <span class="dc-val">${safe(f.size_mb)}</span>MB</span></div><button class="dc-delete" onclick="event.stopPropagation();deleteDoc(${i})" title="Remove">✕</button><div style="font-family:var(--font-mono);font-size:9px;color:var(--text-muted);margin-top:6px">${safe(f.time)}</div></div>`;
      }).join('');
      if(state.selectedDoc>=0&&state.selectedDoc<files.length){
        D.pvSplitWrap.style.display='flex';
      }else{
        // The selection no longer refers to a listed file (e.g. it was deleted):
        // hide the split pane so it does not keep showing a stale preview.
        D.pvSplitWrap.style.display='none';state.selectedDoc=-1;
      }
    }else{
      D.docCardsWrap.style.display='none';D.docCards.innerHTML='';D.docEmptyState.style.display='flex';
      D.pvSplitWrap.style.display='none';state.selectedDoc=-1;
    }
    D.docTabBadge.textContent=files.length;
  }catch(e){
    // Best-effort refresh: keep the current UI, but leave a trace for debugging.
    console.error('refreshFileList failed:',e);
  }
}
|
| 553 |
+
|
| 554 |
+
function switchTab(name){
|
| 555 |
+
$$('.tab-btn').forEach(x=>x.classList.remove('active'));
|
| 556 |
+
$$('.tab-panel').forEach(x=>x.classList.remove('active'));
|
| 557 |
+
const btn=document.querySelector(`[data-tab="${name}"]`);
|
| 558 |
+
if(btn)btn.classList.add('active');
|
| 559 |
+
const panel=$('#tab-'+name);
|
| 560 |
+
if(panel)panel.classList.add('active');
|
| 561 |
+
if(name==='status')refreshStatus();
|
| 562 |
+
if(name==='preview')refreshFileList();
|
| 563 |
+
}
|
| 564 |
+
|
| 565 |
+
// Ask the server to remove the processed file at position `idx`, then refresh the
// file list and status panels. Shows a toast with the outcome.
async function deleteDoc(idx){
  try{
    const r=await fetch('/api/files/'+idx,{method:'DELETE'});
    if(!r.ok){showToast('Failed to remove','error');return}
    if(state.selectedDoc===idx){
      // The previewed file is gone: drop the selection and hide its preview pane.
      state.selectedDoc=-1;
      D.pvSplitWrap.style.display='none';
    }else if(state.selectedDoc>idx){
      // Files are addressed by list position, so everything after the removed entry
      // moves up by one; shift the selection so it keeps pointing at the same file.
      // NOTE(review): assumes the server keeps list order after a delete — confirm.
      state.selectedDoc--;
    }
    refreshFileList();refreshStatus();showToast('File removed','success');
  }catch(e){showToast(e.message,'error')}
}
|
| 568 |
+
|
| 569 |
+
// ─── Status ─────────────────────────────────────────────
|
| 570 |
+
// Refresh the System Status tab: model stack, processing parameters and indexed
// chunk count from /api/status, then fill the API configuration form from /api/config.
async function refreshStatus(){
  try{
    const r=await fetch('/api/status');const d=await r.json();
    if(d.ocr){$('#sOCRModel').textContent=d.ocr.model||'PaddleOCR-VL-1.5';$('#sOCRBase').textContent=d.ocr.api_base||(d.ocr.engine==='paddle'?'local':'')}
    if(d.embedding){$('#sEmbedModel').textContent=d.embedding.model||'—';$('#sEmbedBase').textContent=d.embedding.api_base||''}
    if(d.llm){$('#sLLMModel').textContent=d.llm.model||'—';$('#sLLMBase').textContent=d.llm.api_base||''}
    if(d.vector_store)$('#sVectorDB').textContent=d.vector_store;
    if(d.params){
      $('#sChunkSize').value=d.params.chunk_size;$('#sChunkSizeVal').textContent=d.params.chunk_size;
      $('#sChunkOverlap').value=d.params.chunk_overlap;$('#sChunkOverlapVal').textContent=d.params.chunk_overlap;
      $('#sRetrievalK').value=d.params.retrieval_top_k;
    }
    if(d.document_count!==undefined)$('#sDBStats').innerHTML=`<div class="param-row"><span class="param-label">Indexed Chunks</span><span class="param-value">${esc(d.document_count)}</span></div>`;
    // Build the read-only summary from defaults so a missing section cannot throw
    // and abort the rest of the refresh; server values are escaped before innerHTML.
    const ocr=d.ocr||{},emb=d.embedding||{},llm=d.llm||{};
    const keyPart=k=>k?'| key: '+esc(k):'';
    $('#configContent').innerHTML=`<div class="config-readonly">OCR: ${esc(ocr.engine||'—')} | ${esc(ocr.model||'—')} ${keyPart(ocr.api_key)}</div><div class="config-readonly">Embed: ${esc(emb.model||'—')} ${keyPart(emb.api_key)}</div><div class="config-readonly">LLM: ${esc(llm.model||'—')} ${keyPart(llm.api_key)}</div>`;
    const cr=await fetch('/api/config');const c=await cr.json();
    if(c.ocr){$('#cfgOcrEngine').value=c.ocr.engine||'paddle';$('#cfgOcrBase').value=c.ocr.api_base||'';$('#cfgOcrKey').value=c.ocr.api_key||'';$('#cfgOcrModel').value=c.ocr.model_name||''}
    if(c.embedding){$('#cfgEmbedBase').value=c.embedding.api_base||'';$('#cfgEmbedKey').value=c.embedding.api_key||'';$('#cfgEmbedModel').value=c.embedding.model_name||''}
    if(c.llm){$('#cfgLLMBase').value=c.llm.api_base||'';$('#cfgLLMKey').value=c.llm.api_key||'';$('#cfgLLMModel').value=c.llm.model_name||''}
  }catch(e){
    // Best-effort refresh: keep whatever is displayed, but leave a trace for debugging.
    console.error('refreshStatus failed:',e);
  }
}
|
| 585 |
+
|
| 586 |
+
// ─── Params ─────────────────────────────────────────────
|
| 587 |
+
$('#sChunkSize').addEventListener('input',()=>$('#sChunkSizeVal').textContent=$('#sChunkSize').value);
|
| 588 |
+
$('#sChunkOverlap').addEventListener('input',()=>$('#sChunkOverlapVal').textContent=$('#sChunkOverlap').value);
|
| 589 |
+
// POST the chunking / retrieval parameters from the settings form to /api/config
// and show the outcome in #paramsMsg ("Saved" disappears again after 2 s).
async function saveProcessingParams() {
  const payload = {
    CHUNK_SIZE: $('#sChunkSize').value,
    CHUNK_OVERLAP: $('#sChunkOverlap').value,
    RETRIEVAL_TOP_K: $('#sRetrievalK').value,
  };
  try {
    const response = await fetch('/api/config', {
      method: 'POST',
      headers: {'Content-Type': 'application/json'},
      body: JSON.stringify(payload),
    });
    const result = await response.json();
    if (!result.success) {
      $('#paramsMsg').innerHTML = '<span style="color:var(--accent-red)">Failed</span>';
      return;
    }
    $('#paramsMsg').innerHTML = '<span style="color:var(--accent-green)">Saved</span>';
    setTimeout(() => { $('#paramsMsg').innerHTML = ''; }, 2000);
  } catch (err) {
    // Network failure or a non-JSON response body.
    $('#paramsMsg').innerHTML = '<span style="color:var(--accent-red)">' + err.message + '</span>';
  }
}
|
| 595 |
+
|
| 596 |
+
// ─── Chat ───────────────────────────────────────────────
|
| 597 |
+
// Append one chat bubble to the message list and scroll it into view.
// `role` becomes part of the CSS class; 'user' is labelled "You", anything else "AI".
function addMessage(role, content) {
  D.chatEmpty.style.display = 'none';
  const avatarLabel = role === 'user' ? 'You' : 'AI';
  const row = document.createElement('div');
  row.className = 'message ' + role;
  row.innerHTML = `<div class="msg-avatar">${avatarLabel}</div><div class="msg-bubble">${fmt(content)}</div>`;
  D.chatMessages.appendChild(row);
  D.chatMessages.scrollTop = D.chatMessages.scrollHeight;
}
|
| 598 |
+
// Show the animated "assistant is typing" placeholder (id="typingMsg") at the end of the chat.
function addTyping() {
  const placeholder = document.createElement('div');
  placeholder.className = 'message assistant';
  placeholder.id = 'typingMsg';
  placeholder.innerHTML = '<div class="msg-avatar">AI</div><div class="msg-bubble"><div class="typing-indicator"><span></span><span></span><span></span></div></div>';
  D.chatMessages.appendChild(placeholder);
  D.chatMessages.scrollTop = D.chatMessages.scrollHeight;
}
|
| 599 |
+
// Remove the typing placeholder created by addTyping(), if it is currently shown.
function remTyping() {
  const placeholder = document.getElementById('typingMsg');
  if (!placeholder) return;
  placeholder.remove();
}
|
| 600 |
+
// Convert a plain-text / light-Markdown message into HTML that is safe to assign to innerHTML.
// Returns '' for empty input. Supported markup: **bold**, *italic*, `code`,
// blank line -> new paragraph, single newline -> <br>.
function fmt(t) {
  if (!t) return '';
  // Escape HTML metacharacters FIRST so nothing in the message can inject markup;
  // '&' must be handled before '<' and '>' or their entities would be double-escaped.
  const escaped = t
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
  const inline = escaped
    .replace(/\*\*(.+?)\*\*/g, '<strong>$1</strong>')
    .replace(/\*(.+?)\*/g, '<em>$1</em>')
    .replace(/`([^`]+)`/g, '<code>$1</code>');
  return '<p>' + inline.replace(/\n\n/g, '</p><p>').replace(/\n/g, '<br>') + '</p>';
}
|
| 601 |
+
// Send the text in the question box to /api/chat, then render the answer and its sources.
// Does nothing when the box is empty or state.ready is falsy. The send button is
// disabled for the duration of the request and always re-enabled afterwards.
async function sendQuestion() {
  const q = D.questionInput.value.trim();
  if (!q || !state.ready) return;

  D.questionInput.value = '';
  D.sendBtn.disabled = true;
  D.questionInput.style.height = 'auto';
  addMessage('user', q);
  addTyping();

  try {
    const r = await fetch('/api/chat', {
      method: 'POST',
      headers: {'Content-Type': 'application/json'},
      body: JSON.stringify({question: q}),
    });
    if (!r.ok) {
      // The error body is not guaranteed to be JSON; fall back to the generic message
      // instead of surfacing a JSON parse error.
      const err = await r.json().catch(() => ({}));
      throw new Error(err.detail || 'Chat failed');
    }
    const d = await r.json();
    remTyping();
    addMessage('assistant', d.answer);

    if (d.sources && d.sources.length) {
      // Every field comes from the server / uploaded documents, so escape all of
      // them before building HTML (file names in particular are user-controlled).
      const cell = (v) => esc(String(v));
      const sh = d.sources.map((s) => {
        const full = s.content || '';
        const excerpt = full.substring(0, 200);
        // Only mark the excerpt as truncated when something was actually cut off.
        const ellipsis = full.length > 200 ? '...' : '';
        return `<div class="source-item"><div class="src-header"><span>#${cell(s.rank)}</span><span>${cell(s.document || '')}</span><span>Page ${cell(s.page)}</span><span>${cell(s.content_type || '')}</span></div><div class="src-excerpt">${esc(excerpt)}${ellipsis}</div></div>`;
      }).join('');
      D.sToggle.style.display = 'flex';
      D.sContent.innerHTML = sh;
    } else {
      // Do not leave the previous answer's sources on screen for an answer that has none.
      D.sToggle.style.display = 'none';
      D.sContent.innerHTML = '';
    }
  } catch (e) {
    remTyping();
    addMessage('assistant', 'Error: ' + e.message);
    showToast(e.message, 'error');
  } finally {
    D.sendBtn.disabled = false;
    D.questionInput.focus();
  }
}
|
| 610 |
+
// Send on button click.
D.sendBtn.addEventListener('click', sendQuestion);

// Enter sends the question; Shift+Enter keeps the default behaviour (new line).
D.questionInput.addEventListener('keydown', (ev) => {
  if (ev.key !== 'Enter' || ev.shiftKey) return;
  ev.preventDefault();
  sendQuestion();
});

// Auto-grow the question box with its content, capped at 120px.
D.questionInput.addEventListener('input', () => {
  const box = D.questionInput;
  box.style.height = 'auto';
  box.style.height = Math.min(box.scrollHeight, 120) + 'px';
});

// Clicking any element with class "quick-prompt" copies its data-question into the box.
document.addEventListener('click', (ev) => {
  if (!ev.target.classList.contains('quick-prompt')) return;
  const box = D.questionInput;
  box.value = ev.target.dataset.question;
  box.focus();
  box.style.height = 'auto';
  box.style.height = Math.min(box.scrollHeight, 120) + 'px';
});

// Expand / collapse the sources panel and keep the toggle caption in sync.
D.sToggle.addEventListener('click', () => {
  D.sContent.classList.toggle('open');
  const isOpen = D.sContent.classList.contains('open');
  D.sToggle.textContent = isOpen ? '↑ Hide Sources & References' : '↓ Sources & References';
});
|
| 615 |
+
|
| 616 |
+
// ─── Tabs ───────────────────────────────────────────────
|
| 617 |
+
// Tab switching: activate the clicked button and the panel "#tab-<data-tab>",
// and reload data for the tabs that display server state.
$$('.tab-btn').forEach((btn) => {
  btn.addEventListener('click', () => {
    $$('.tab-btn').forEach((other) => other.classList.remove('active'));
    $$('.tab-panel').forEach((panel) => panel.classList.remove('active'));
    btn.classList.add('active');

    const tabName = btn.dataset.tab;
    const panel = $('#tab-' + tabName);
    if (panel) panel.classList.add('active');

    if (tabName === 'status') refreshStatus();
    if (tabName === 'preview') refreshFileList();
  });
});
|
| 618 |
+
|
| 619 |
+
// ─── Config ─────────────────────────────────────────────
|
| 620 |
+
// Toggle the input with the given id between masked and plain text,
// and swap the caption of the button that triggered it.
function togglePassword(id, btn) {
  const field = document.getElementById(id);
  const masked = field.type === 'password';
  if (masked) {
    field.type = 'text';
    btn.textContent = '—';
  } else {
    field.type = 'password';
    btn.textContent = '👁';
  }
}
|
| 621 |
+
// Switch the config card between read-only view and edit form.
// Opening the form also calls refreshStatus(); closing delegates to cancelConfigEdit().
function toggleConfigEdit() {
  const viewPane = $('#configDisplay');
  const editPane = $('#configEdit');
  const toggleBtn = $('#editConfigBtn');
  if (editPane.style.display !== 'none') {
    cancelConfigEdit();
    return;
  }
  viewPane.style.display = 'none';
  editPane.style.display = 'block';
  toggleBtn.textContent = 'Cancel';
  refreshStatus();
}
|
| 622 |
+
// Leave edit mode: show the read-only view, hide the form, reset the button caption
// and clear any status message.
function cancelConfigEdit() {
  $('#configDisplay').style.display = 'block';
  $('#configEdit').style.display = 'none';
  $('#editConfigBtn').textContent = 'Edit';
  $('#configMsg').innerHTML = '';
}
|
| 623 |
+
// POST the OCR / embedding / LLM connection settings from the edit form to /api/config.
// On success the form closes and the status view reloads after 1.5 s.
async function saveConfig() {
  const payload = {
    OCR_ENGINE: $('#cfgOcrEngine').value,
    OCR_API_BASE: $('#cfgOcrBase').value,
    OCR_API_KEY: $('#cfgOcrKey').value,
    OCR_API_MODEL: $('#cfgOcrModel').value,
    EMBEDDING_API_BASE: $('#cfgEmbedBase').value,
    EMBEDDING_API_KEY: $('#cfgEmbedKey').value,
    EMBEDDING_MODEL_NAME: $('#cfgEmbedModel').value,
    LLM_API_BASE: $('#cfgLLMBase').value,
    LLM_API_KEY: $('#cfgLLMKey').value,
    LLM_MODEL_NAME: $('#cfgLLMModel').value,
  };
  try {
    const response = await fetch('/api/config', {
      method: 'POST',
      headers: {'Content-Type': 'application/json'},
      body: JSON.stringify(payload),
    });
    const result = await response.json();
    if (!result.success) {
      $('#configMsg').innerHTML = '<span style="color:var(--accent-red)">Save failed</span>';
      return;
    }
    $('#configMsg').innerHTML = '<span style="color:var(--accent-green)">Config saved. Restart to apply.</span>';
    setTimeout(() => {
      cancelConfigEdit();
      refreshStatus();
    }, 1500);
  } catch (err) {
    // Network failure or a non-JSON response body.
    $('#configMsg').innerHTML = '<span style="color:var(--accent-red)">' + err.message + '</span>';
  }
}
|
| 626 |
+
// The Edit / Cancel button toggles the config form (see toggleConfigEdit).
document.getElementById('editConfigBtn').addEventListener('click',toggleConfigEdit);
|
| 627 |
+
|
| 628 |
+
// ─── Clear chat ─────────────────────────────────────────
|
| 629 |
+
// Reset the conversation: ask the server to drop its history (best effort), then
// restore the empty-state placeholder, hide the sources panel and clear the input.
async function clearChat() {
  try {
    await fetch('/api/chat', {method: 'DELETE'});
  } catch (e) {
    // Best effort: the local view is reset even if the server call fails.
  }

  const list = D.chatMessages;
  list.innerHTML = '';
  list.appendChild(D.chatEmpty);
  D.chatEmpty.style.display = 'flex';

  D.sToggle.style.display = 'none';
  D.sContent.innerHTML = '';
  D.sContent.classList.remove('open');

  D.questionInput.value = '';
  D.questionInput.focus();
}
|
| 630 |
+
// Ctrl+Shift+K clears the chat immediately.
document.addEventListener('keydown', (ev) => {
  const isShortcut = ev.ctrlKey && ev.shiftKey && ev.key === 'K';
  if (!isShortcut) return;
  ev.preventDefault();
  clearChat();
});

// Right-click inside the message list offers to clear the chat (replaces the browser menu).
D.chatMessages.addEventListener('contextmenu', (ev) => {
  ev.preventDefault();
  if (!confirm('Clear all chat history?')) return;
  clearChat();
});
|
| 632 |
+
|
| 633 |
+
// ─── Init ───────────────────────────────────────────────
|
| 634 |
+
// Initial render once the script has loaded.
renderQueue();
refreshStatus();
refreshFileList();
|
| 635 |
+
</script>
|
| 636 |
+
</body>
|
| 637 |
+
</html>
|
test.png
ADDED
|
Git LFS Details
|
text_processor.py
ADDED
|
@@ -0,0 +1,606 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
============================================================
|
| 3 |
+
文本处理模块: Markdown 清洗 + 智能分割 (Chunking)
|
| 4 |
+
============================================================
|
| 5 |
+
适配 PaddleOCR-VL-1.5 输出的 Markdown 格式文本
|
| 6 |
+
|
| 7 |
+
功能:
|
| 8 |
+
1. Markdown 文本清洗 (保留表格/公式结构)
|
| 9 |
+
2. 基于 LangChain 的语义感知分割
|
| 10 |
+
3. 表格/公式专项处理
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import re
|
| 14 |
+
from typing import List, Optional, Callable
|
| 15 |
+
|
| 16 |
+
from langchain_core.documents import Document
|
| 17 |
+
|
| 18 |
+
from loguru import logger
|
| 19 |
+
|
| 20 |
+
import config
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# ============================================================
|
| 24 |
+
# 内置递归文本分割器 (替代 langchain_text_splitters)
|
| 25 |
+
# ============================================================
|
| 26 |
+
# 避免 langchain_text_splitters → sentence_transformers → transformers
|
| 27 |
+
# 的传递依赖链在部分环境中导致的兼容性问题
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class RecursiveCharacterTextSplitter:
|
| 31 |
+
"""
|
| 32 |
+
递归字符文本分割器
|
| 33 |
+
|
| 34 |
+
与 langchain_text_splitters.RecursiveCharacterTextSplitter 接口兼容,
|
| 35 |
+
按分隔符优先级逐级分割, 保持语义完整性。
|
| 36 |
+
"""
|
| 37 |
+
|
| 38 |
+
def __init__(
|
| 39 |
+
self,
|
| 40 |
+
chunk_size: int = 800,
|
| 41 |
+
chunk_overlap: int = 150,
|
| 42 |
+
separators: Optional[List[str]] = None,
|
| 43 |
+
add_start_index: bool = True,
|
| 44 |
+
length_function: Callable[[str], int] = len,
|
| 45 |
+
keep_separator: bool = True,
|
| 46 |
+
strip_whitespace: bool = True,
|
| 47 |
+
):
|
| 48 |
+
self.chunk_size = chunk_size
|
| 49 |
+
self.chunk_overlap = chunk_overlap
|
| 50 |
+
self.separators = separators or ["\n\n", "\n", "。", "!", "?", ";", ".", "!", "?", ";", " ", ""]
|
| 51 |
+
self.add_start_index = add_start_index
|
| 52 |
+
self.length_function = length_function
|
| 53 |
+
self.keep_separator = keep_separator
|
| 54 |
+
self.strip_whitespace = strip_whitespace
|
| 55 |
+
|
| 56 |
+
def split_documents(self, documents: List[Document]) -> List[Document]:
|
| 57 |
+
"""分割 Document 列表"""
|
| 58 |
+
chunks = []
|
| 59 |
+
for doc in documents:
|
| 60 |
+
doc_chunks = self.split_text(doc.page_content, doc.metadata)
|
| 61 |
+
chunks.extend(doc_chunks)
|
| 62 |
+
return chunks
|
| 63 |
+
|
| 64 |
+
def split_text(self, text: str, metadata: Optional[dict] = None) -> List[Document]:
|
| 65 |
+
"""分割单个文本, 返回 Document 列表"""
|
| 66 |
+
metadata = metadata or {}
|
| 67 |
+
splits = self._split(text, self.separators)
|
| 68 |
+
chunks = self._merge(splits)
|
| 69 |
+
|
| 70 |
+
docs = []
|
| 71 |
+
for i, chunk in enumerate(chunks):
|
| 72 |
+
chunk_meta = {**metadata}
|
| 73 |
+
if self.add_start_index:
|
| 74 |
+
chunk_meta["start_index"] = text.find(chunk) if chunk in text else 0
|
| 75 |
+
docs.append(Document(page_content=chunk, metadata=chunk_meta))
|
| 76 |
+
return docs
|
| 77 |
+
|
| 78 |
+
def create_documents(
|
| 79 |
+
self, texts: List[str], metadatas: Optional[List[dict]] = None
|
| 80 |
+
) -> List[Document]:
|
| 81 |
+
"""从文本列表创建 Document 列表"""
|
| 82 |
+
metadatas = metadatas or [{}] * len(texts)
|
| 83 |
+
docs = []
|
| 84 |
+
for text, meta in zip(texts, metadatas):
|
| 85 |
+
docs.extend(self.split_text(text, meta))
|
| 86 |
+
return docs
|
| 87 |
+
|
| 88 |
+
def _split(self, text: str, separators: List[str]) -> List[str]:
|
| 89 |
+
"""递归分割"""
|
| 90 |
+
# 使用最合适的分隔符
|
| 91 |
+
sep = separators[-1] # 默认用最后一个 (空字符串, 按字符分割)
|
| 92 |
+
for s in separators:
|
| 93 |
+
if s == "":
|
| 94 |
+
sep = s
|
| 95 |
+
break
|
| 96 |
+
if s in text:
|
| 97 |
+
sep = s
|
| 98 |
+
break
|
| 99 |
+
|
| 100 |
+
# 按分隔符分割
|
| 101 |
+
if sep == "":
|
| 102 |
+
# 按字符分割
|
| 103 |
+
splits = list(text)
|
| 104 |
+
else:
|
| 105 |
+
if self.keep_separator:
|
| 106 |
+
# 保留分隔符在片段末尾
|
| 107 |
+
parts = text.split(sep)
|
| 108 |
+
splits = []
|
| 109 |
+
for i, part in enumerate(parts):
|
| 110 |
+
if i > 0:
|
| 111 |
+
splits.append(sep + part)
|
| 112 |
+
else:
|
| 113 |
+
splits.append(part)
|
| 114 |
+
else:
|
| 115 |
+
splits = text.split(sep)
|
| 116 |
+
|
| 117 |
+
# 去除空白并过滤空字符串
|
| 118 |
+
if self.strip_whitespace:
|
| 119 |
+
splits = [s.strip() for s in splits]
|
| 120 |
+
splits = [s for s in splits if s]
|
| 121 |
+
|
| 122 |
+
# 递归处理超长片段
|
| 123 |
+
final_splits = []
|
| 124 |
+
for split in splits:
|
| 125 |
+
if self.length_function(split) <= self.chunk_size:
|
| 126 |
+
final_splits.append(split)
|
| 127 |
+
else:
|
| 128 |
+
# 片段仍超长, 用下一级分隔符递归分割
|
| 129 |
+
if len(separators) > 1:
|
| 130 |
+
next_seps = separators[separators.index(sep) + 1 :]
|
| 131 |
+
final_splits.extend(self._split(split, next_seps))
|
| 132 |
+
else:
|
| 133 |
+
# 无法再分, 强制按字符切分
|
| 134 |
+
forced = self._force_split(split)
|
| 135 |
+
final_splits.extend(forced)
|
| 136 |
+
|
| 137 |
+
return final_splits
|
| 138 |
+
|
| 139 |
+
def _force_split(self, text: str) -> List[str]:
|
| 140 |
+
"""强制按字符数切分 (兜底)"""
|
| 141 |
+
chunks = []
|
| 142 |
+
for i in range(0, len(text), self.chunk_size - self.chunk_overlap):
|
| 143 |
+
chunk = text[i : i + self.chunk_size]
|
| 144 |
+
if self.strip_whitespace:
|
| 145 |
+
chunk = chunk.strip()
|
| 146 |
+
if chunk:
|
| 147 |
+
chunks.append(chunk)
|
| 148 |
+
return chunks
|
| 149 |
+
|
| 150 |
+
def _merge(self, splits: List[str]) -> List[str]:
|
| 151 |
+
"""合并短片段为 chunk_size 大小的块"""
|
| 152 |
+
if not splits:
|
| 153 |
+
return []
|
| 154 |
+
|
| 155 |
+
chunks = []
|
| 156 |
+
current = ""
|
| 157 |
+
current_len = 0
|
| 158 |
+
|
| 159 |
+
for split in splits:
|
| 160 |
+
split_len = self.length_function(split)
|
| 161 |
+
|
| 162 |
+
if current_len + split_len <= self.chunk_size:
|
| 163 |
+
if current:
|
| 164 |
+
current += "\n\n" + split
|
| 165 |
+
current_len += 2 + split_len
|
| 166 |
+
else:
|
| 167 |
+
current = split
|
| 168 |
+
current_len = split_len
|
| 169 |
+
else:
|
| 170 |
+
if current:
|
| 171 |
+
chunks.append(current)
|
| 172 |
+
# 重叠: 保留前一块的尾部
|
| 173 |
+
if self.chunk_overlap > 0 and current:
|
| 174 |
+
overlap_text = current[-self.chunk_overlap:]
|
| 175 |
+
current = overlap_text + "\n\n" + split
|
| 176 |
+
current_len = self.length_function(current)
|
| 177 |
+
else:
|
| 178 |
+
current = split
|
| 179 |
+
current_len = split_len
|
| 180 |
+
|
| 181 |
+
if current:
|
| 182 |
+
chunks.append(current)
|
| 183 |
+
|
| 184 |
+
return chunks
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
# ============================================================
|
| 188 |
+
# Markdown 文本清洗器
|
| 189 |
+
# ============================================================
|
| 190 |
+
|
| 191 |
+
class MarkdownTextCleaner:
    """Cleaning helpers for the Markdown text produced by PaddleOCR-VL-1.5."""

    @staticmethod
    def clean(text: str, preserve_structure: bool = True) -> str:
        """
        Clean Markdown text.

        - Removes control characters (newlines and tabs are kept).
        - Normalizes line endings to ``\\n``.
        - With ``preserve_structure``: table rows and fenced code blocks keep
          their inner whitespace (only trailing whitespace is removed); every
          other line is stripped, runs of spaces collapse to one space and
          consecutive blank lines collapse to one.
        - Without it: runs of spaces collapse and spaces around newlines go.

        Returns "" for empty input.
        """
        if not text:
            return ""

        cleaned = text.strip()

        # Drop control characters but keep \t, \n and \r (\r is normalized next).
        cleaned = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', cleaned)

        # Unify line endings.
        cleaned = cleaned.replace('\r\n', '\n').replace('\r', '\n')

        if preserve_structure:
            cleaned_lines = []
            in_table = False
            in_code = False

            for line in cleaned.split('\n'):
                stripped = line.strip()

                # Code fences are handled before anything else: a fence right
                # after a table must still toggle the code state, and lines
                # inside a code block must never be mistaken for table rows.
                if stripped.startswith('```'):
                    in_code = not in_code
                    in_table = False
                    cleaned_lines.append(line.rstrip())
                    continue
                if in_code:
                    cleaned_lines.append(line.rstrip())
                    continue

                # Markdown table row: starts with "|" and has at least one more.
                if stripped.startswith('|') and '|' in stripped[1:]:
                    in_table = True
                    cleaned_lines.append(line.rstrip())
                    continue
                # Delimiter row of a table written without a leading pipe.
                if in_table and stripped and re.match(r'^[\s\|:\-]+$', line):
                    cleaned_lines.append(line.rstrip())
                    continue

                # Ordinary line (this also ends any table): strip it and
                # collapse runs of spaces.
                in_table = False
                collapsed = re.sub(r' +', ' ', stripped)
                if collapsed:
                    cleaned_lines.append(collapsed)
                elif cleaned_lines and cleaned_lines[-1] != '':
                    cleaned_lines.append('')

            cleaned = '\n'.join(cleaned_lines)
        else:
            cleaned = re.sub(r' +', ' ', cleaned)
            cleaned = re.sub(r' *\n *', '\n', cleaned)

        # Cap runs of newlines at three.
        cleaned = re.sub(r'\n{4,}', '\n\n\n', cleaned)

        return cleaned.strip()

    @staticmethod
    def clean_documents(documents: List["Document"]) -> List["Document"]:
        """
        Clean every document and drop those that end up empty.

        Kept documents are new objects whose metadata adds ``cleaned``,
        ``original_length`` and ``cleaned_length``.
        """
        cleaned_docs = []
        for doc in documents:
            original_len = len(doc.page_content)
            cleaned_text = MarkdownTextCleaner.clean(doc.page_content)
            cleaned_len = len(cleaned_text)

            if cleaned_text:
                cleaned_doc = Document(
                    page_content=cleaned_text,
                    metadata={
                        **doc.metadata,
                        "cleaned": True,
                        "original_length": original_len,
                        "cleaned_length": cleaned_len,
                    },
                )
                cleaned_docs.append(cleaned_doc)
            else:
                logger.debug(
                    f"页面 {doc.metadata.get('page', '?')} 清洗后为空, 已跳过"
                )

        logger.info(
            f"文本清洗: {len(documents)} → {len(cleaned_docs)} 个文档 "
            f"(移除 {len(documents) - len(cleaned_docs)} 个空白页)"
        )
        return cleaned_docs

    @staticmethod
    def extract_tables_as_chunks(documents: List["Document"]) -> List["Document"]:
        """
        Turn the tables recorded in document metadata into standalone chunks.

        Reads ``tables_html`` and ``tables_markdown`` from each document's
        metadata and pairs them by index; the Markdown form is preferred as
        chunk content and the HTML form is the fallback. The lists may differ
        in length or be missing: the shorter one is padded with empty strings
        so that no table is lost.
        """
        table_docs = []
        for doc in documents:
            tables_html = doc.metadata.get("tables_html") or []
            tables_md = doc.metadata.get("tables_markdown") or []

            for i in range(max(len(tables_html), len(tables_md))):
                html = tables_html[i] if i < len(tables_html) else ""
                md = tables_md[i] if i < len(tables_md) else ""
                content = md or html or ""
                if content.strip():
                    table_doc = Document(
                        page_content=f"[表格数据]\n{content}",
                        metadata={
                            **doc.metadata,
                            "content_type": "table",
                            "table_index": i,
                            "table_html": html,
                            "table_markdown": md,
                        },
                    )
                    table_docs.append(table_doc)

        if table_docs:
            logger.info(f"提取了 {len(table_docs)} 个表格块")
        return table_docs

    @staticmethod
    def extract_formulas_as_chunks(documents: List["Document"]) -> List["Document"]:
        """
        Turn the LaTeX formulas recorded in document metadata into standalone chunks.

        Reads ``formulas_latex`` from each document's metadata; blank entries
        are skipped and each formula is wrapped in ``$$ ... $$``.
        """
        formula_docs = []
        for doc in documents:
            formulas_latex = doc.metadata.get("formulas_latex") or []
            for i, latex in enumerate(formulas_latex):
                if latex and latex.strip():
                    formula_doc = Document(
                        page_content=f"[公式]\n$${latex}$$",
                        metadata={
                            **doc.metadata,
                            "content_type": "formula",
                            "formula_index": i,
                            "formula_latex": latex,
                        },
                    )
                    formula_docs.append(formula_doc)

        if formula_docs:
            logger.info(f"提取了 {len(formula_docs)} 个公式块")
        return formula_docs
|
| 343 |
+
|
| 344 |
+
|
| 345 |
+
# ============================================================
|
| 346 |
+
# 智能文本分割器
|
| 347 |
+
# ============================================================
|
| 348 |
+
|
| 349 |
+
class DocumentSplitter:
    """
    Size-based document splitter.

    Thin wrapper around RecursiveCharacterTextSplitter that takes its defaults
    (chunk size, overlap, separator list) from the ``config`` module and logs a
    summary of each run. Where the cuts fall is decided entirely by the
    separator list in use.
    """

    def __init__(
        self,
        chunk_size: int = config.CHUNK_SIZE,
        chunk_overlap: int = config.CHUNK_OVERLAP,
        separators: Optional[List[str]] = None,
    ):
        """
        Args:
            chunk_size: Maximum chunk length in characters.
            chunk_overlap: Characters shared between neighbouring chunks.
            separators: Separator priority list; ``config.SEPARATORS`` when
                omitted or empty.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separators = separators or config.SEPARATORS

        splitter_options = dict(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=self.separators,
            add_start_index=True,
            length_function=len,
            keep_separator=True,
            strip_whitespace=True,
        )
        self._splitter = RecursiveCharacterTextSplitter(**splitter_options)

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """Split a list of documents into chunks; an empty input yields an empty list."""
        if not documents:
            return []

        result = self._splitter.split_documents(documents)
        logger.info(
            f"文本分割: {len(documents)} → {len(result)} 个文本块 "
            f"(块大小={self.chunk_size}, 重叠={self.chunk_overlap})"
        )
        return result

    def split_text(self, text: str, metadata: Optional[dict] = None) -> List[Document]:
        """Split a single text; ``metadata`` (default: empty dict) is copied onto every chunk."""
        meta = metadata or {}
        return self._splitter.create_documents([text], metadatas=[meta])
|
| 396 |
+
|
| 397 |
+
|
| 398 |
+
class MarkdownAwareSplitter:
    """
    Markdown-aware splitter.

    Splits at Markdown structure boundaries:
    - a level 1-3 heading starts a new section
    - tables are kept in one piece
    - fenced code blocks are kept in one piece

    Sections are then packed into chunks of roughly ``target_chunk_size``
    characters. A single section is never cut, so a chunk can exceed the
    target when one section alone is larger than it.
    """

    def __init__(
        self,
        target_chunk_size: int = config.CHUNK_SIZE,
        min_chunk_size: int = 100,
    ):
        """
        Args:
            target_chunk_size: Size (in characters) that merged chunks aim for.
            min_chunk_size: Sections shorter than this are merged into a
                neighbouring chunk instead of standing alone.
        """
        self.target_chunk_size = target_chunk_size
        self.min_chunk_size = min_chunk_size

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """Split every document along its Markdown structure and return all chunks."""
        all_chunks = []

        for doc in documents:
            sections = self._split_by_headers(doc.page_content)
            chunks = self._merge_sections(
                sections, doc.metadata, self.target_chunk_size, self.min_chunk_size
            )
            all_chunks.extend(chunks)

        logger.info(
            f"Markdown 感知分割: {len(documents)} → {len(all_chunks)} 个文本块"
        )
        return all_chunks

    @staticmethod
    def _split_by_headers(text: str) -> List[str]:
        """
        Split ``text`` in front of every level 1-3 Markdown heading.

        Fenced code blocks and tables are replaced by placeholders while
        splitting, so a ``# ...`` line inside them is not taken for a heading.
        Returns the non-empty, stripped sections in document order.
        """
        protection_map = {}

        def protect(match):
            block = match.group(0)
            # The table pattern also consumes the newline after the last row.
            # Hand that newline back, otherwise a heading directly below the
            # table would no longer be preceded by "\n" and would be missed.
            trailing = "\n" if block.endswith("\n") else ""
            if trailing:
                block = block[:-1]
            key = f"__PROTECTED_{len(protection_map)}__"
            protection_map[key] = block
            return key + trailing

        # Protect fenced code blocks.
        text = re.sub(r'```[\s\S]*?```', protect, text)
        # Protect tables (consecutive lines delimited by "|").
        text = re.sub(
            r'(?:^\|.+\|\n)+(?:^\|[\s\-:]+\|\n)?(?:^\|.+\|\n?)+',
            protect,
            text,
            flags=re.MULTILINE,
        )

        # Split in front of headings.
        raw_sections = re.split(r'\n(?=#{1,3}\s)', text)

        # Restore the protected blocks, newest first: a block protected later
        # may itself contain the placeholder of an earlier one.
        restore_order = list(reversed(list(protection_map.items())))
        sections = []
        for section in raw_sections:
            for key, original in restore_order:
                section = section.replace(key, original)
            section = section.strip()
            if section:
                sections.append(section)

        return sections

    @staticmethod
    def _merge_sections(
        sections: List[str],
        base_metadata: dict,
        target_size: int,
        min_size: int,
    ) -> List[Document]:
        """
        Pack consecutive sections into chunks of about ``target_size`` characters.

        No text is ever discarded: a section shorter than ``min_size`` that
        cannot be combined within the target is carried into the following
        chunk (so a short heading stays with its content), a short tail is
        appended to the last chunk, and a document that is short as a whole
        still produces one chunk.

        Each chunk's metadata is ``base_metadata`` plus ``chunk_sections``
        ("first-last" section indices) and ``chunk_type``.
        """
        chunks = []
        current = ""
        start_idx = 0
        last_chunk_start = 0

        def make_chunk(content: str, first: int, last: int) -> Document:
            # Build one chunk covering sections first..last.
            return Document(
                page_content=content,
                metadata={
                    **base_metadata,
                    "chunk_sections": f"{first}-{last}",
                    "chunk_type": "markdown_semantic",
                },
            )

        for i, section in enumerate(sections):
            if not current:
                current = section
                start_idx = i
            elif len(current) + len(section) + 2 <= target_size:
                current += "\n\n" + section
            elif len(current) >= min_size:
                chunks.append(make_chunk(current, start_idx, i - 1))
                last_chunk_start = start_idx
                current = section
                start_idx = i
            else:
                # Too short to stand alone and the next section does not fit:
                # keep both together rather than dropping the short one.
                current += "\n\n" + section

        # Last chunk.
        if current:
            last_idx = len(sections) - 1
            if len(current) >= min_size or not chunks:
                chunks.append(make_chunk(current, start_idx, last_idx))
            else:
                chunks[-1].page_content += "\n\n" + current
                # Keep the recorded section range in line with the content.
                chunks[-1].metadata["chunk_sections"] = f"{last_chunk_start}-{last_idx}"

        return chunks
|
| 513 |
+
|
| 514 |
+
|
| 515 |
+
# ============================================================
|
| 516 |
+
# 完整处理流水线
|
| 517 |
+
# ============================================================
|
| 518 |
+
|
| 519 |
+
class TextProcessingPipeline:
    """
    End-to-end text processing pipeline.

    Usage:
        pipeline = TextProcessingPipeline()
        chunks = pipeline.process(raw_documents)
    """

    def __init__(
        self,
        chunk_size: int = config.CHUNK_SIZE,
        chunk_overlap: int = config.CHUNK_OVERLAP,
        split_method: str = "recursive",
        extract_tables: bool = True,
        extract_formulas: bool = False,
        clean_text: bool = True,
    ):
        """
        Args:
            chunk_size: Target chunk length in characters.
            chunk_overlap: Overlap between chunks (used by the recursive splitter only).
            split_method: "markdown" selects MarkdownAwareSplitter; any other
                value selects DocumentSplitter.
            extract_tables: Also emit tables found in metadata as separate chunks.
            extract_formulas: Also emit formulas found in metadata as separate chunks.
            clean_text: Run MarkdownTextCleaner before splitting.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.split_method = split_method
        self.extract_tables = extract_tables
        self.extract_formulas = extract_formulas
        self.clean_text = clean_text

        if split_method != "markdown":
            self.splitter = DocumentSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            )
        else:
            self.splitter = MarkdownAwareSplitter(
                target_chunk_size=chunk_size,
                min_chunk_size=max(50, chunk_size // 4),
            )

    def process(self, documents: List[Document]) -> List[Document]:
        """
        Run the whole pipeline:
        raw documents -> clean -> extract tables/formulas -> split -> final chunks.

        Table/formula chunks are appended after the text chunks, and every chunk
        receives a sequential ``chunk_id`` in its metadata.
        """
        working = list(documents)
        logger.info(f"文本处理流水线启动: {len(working)} 个原始文档")

        # Step 1: clean the text.
        if self.clean_text:
            working = MarkdownTextCleaner.clean_documents(working)

        # Step 2: pull tables and formulas out as standalone chunks.
        special_chunks = []
        if self.extract_tables:
            special_chunks += MarkdownTextCleaner.extract_tables_as_chunks(working)
        if self.extract_formulas:
            special_chunks += MarkdownTextCleaner.extract_formulas_as_chunks(working)

        # Step 3: split the text.
        result = self.splitter.split_documents(working)

        # Step 4: append the special chunks.
        if special_chunks:
            result.extend(special_chunks)
            logger.info(f"合并特殊块后总计: {len(result)} 个文本块")

        # Step 5: assign chunk ids.
        for position, piece in enumerate(result):
            piece.metadata["chunk_id"] = f"chunk_{position:06d}"

        logger.info(f"文本处理完成: {len(documents)} 页 → {len(result)} 个文本块")
        return result
|
| 588 |
+
|
| 589 |
+
|
| 590 |
+
# ============================================================
|
| 591 |
+
# 便捷函数
|
| 592 |
+
# ============================================================
|
| 593 |
+
|
| 594 |
+
def process_documents(
    documents: List[Document],
    chunk_size: int = config.CHUNK_SIZE,
    chunk_overlap: int = config.CHUNK_OVERLAP,
    **kwargs,
) -> List[Document]:
    """
    Convenience wrapper: build a TextProcessingPipeline and run it once.

    Extra keyword arguments are passed to the TextProcessingPipeline constructor.
    """
    return TextProcessingPipeline(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        **kwargs,
    ).process(documents)
|
vector_store.py
ADDED
|
@@ -0,0 +1,307 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
============================================================
|
| 3 |
+
向量数据库存储模块
|
| 4 |
+
============================================================
|
| 5 |
+
嵌入模型: Qwen3-Embedding 系列
|
| 6 |
+
向量数据库: Chroma / FAISS
|
| 7 |
+
|
| 8 |
+
功能:
|
| 9 |
+
1. 文档批量向量化入库
|
| 10 |
+
2. 相似度检索 / MMR / 元数据过滤
|
| 11 |
+
3. 持久化与增量更新
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import List, Optional, Dict, Any, Callable
|
| 16 |
+
|
| 17 |
+
from langchain_core.documents import Document
|
| 18 |
+
from langchain_core.embeddings import Embeddings
|
| 19 |
+
from langchain_core.vectorstores import VectorStore
|
| 20 |
+
|
| 21 |
+
from langchain_community.vectorstores import Chroma, FAISS
|
| 22 |
+
|
| 23 |
+
from loguru import logger
|
| 24 |
+
|
| 25 |
+
import config
|
| 26 |
+
from embeddings import get_embedding_model
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# ============================================================
|
| 30 |
+
# 向量数据库工厂
|
| 31 |
+
# ============================================================
|
| 32 |
+
|
| 33 |
+
class VectorStoreFactory:
    """Build empty vector store instances for the supported backends."""

    @staticmethod
    def create_chroma(
        persist_directory: Optional[str | Path] = None,
        collection_name: str = config.CHROMA_COLLECTION_NAME,
        embedding_function: Optional[Embeddings] = None,
    ) -> Chroma:
        """Create a Chroma store.

        Args:
            persist_directory: Storage directory; defaults to
                ``config.VECTOR_DB_DIR / "chroma"``.
            collection_name: Name of the Chroma collection.
            embedding_function: Embedding model; defaults to
                ``get_embedding_model()``.

        Returns:
            A ``Chroma`` instance whose collection metadata requests cosine
            distance and the HNSW parameters below.
        """
        persist_dir = str(persist_directory or config.VECTOR_DB_DIR / "chroma")
        embedding = embedding_function or get_embedding_model()

        logger.info(f"创建 Chroma 向量数据库: {persist_dir} (集合: {collection_name})")

        return Chroma(
            collection_name=collection_name,
            embedding_function=embedding,
            persist_directory=persist_dir,
            collection_metadata={
                "hnsw:space": "cosine",  # Qwen3-Embedding uses cosine similarity
                "hnsw:construction_ef": 200,
                "hnsw:M": 48,
            },
        )

    @staticmethod
    def create_faiss(
        embedding_function: Optional[Embeddings] = None,
    ) -> FAISS:
        """Create an empty, usable FAISS store backed by a flat L2 index.

        The previous implementation passed ``index=None`` and
        ``docstore=None``, which produced an object that failed on the first
        insert or search. A real index needs the vector dimension up front, so
        the embedding model is probed once with a short query to obtain it.

        Args:
            embedding_function: Embedding model; defaults to
                ``get_embedding_model()``.

        Returns:
            A ``FAISS`` instance with an empty index and an empty in-memory
            docstore.
        """
        # Imported locally so the module's import block stays untouched; both
        # names live in langchain_community, which this module already uses.
        from langchain_community.docstore.in_memory import InMemoryDocstore
        from langchain_community.vectorstores.faiss import dependable_faiss_import

        embedding = embedding_function or get_embedding_model()
        logger.info("创建 FAISS 向量数据库 (flat L2 index)")

        faiss = dependable_faiss_import()
        # One probe embedding is the only backend-independent way to learn the
        # vector dimension required by the index constructor.
        dimension = len(embedding.embed_query("dimension probe"))

        return FAISS(
            embedding_function=embedding,
            index=faiss.IndexFlatL2(dimension),
            docstore=InMemoryDocstore(),
            index_to_docstore_id={},
        )

    @staticmethod
    def create(store_type: Optional[str] = None, **kwargs) -> VectorStore:
        """Create a store of the requested type.

        Args:
            store_type: ``"chroma"`` or ``"faiss"``; defaults to
                ``config.VECTOR_STORE_TYPE``.
            **kwargs: Forwarded unchanged to ``create_chroma`` or
                ``create_faiss``.

        Returns:
            The newly created vector store.

        Raises:
            ValueError: If ``store_type`` is neither ``"chroma"`` nor
                ``"faiss"``.
        """
        store_type = store_type or config.VECTOR_STORE_TYPE
        if store_type == "chroma":
            return VectorStoreFactory.create_chroma(**kwargs)
        elif store_type == "faiss":
            return VectorStoreFactory.create_faiss(**kwargs)
        else:
            raise ValueError(f"不支持的向量数据库: {store_type}. 可选: chroma, faiss")
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
# ============================================================
|
| 83 |
+
# 向量数据库管理器
|
| 84 |
+
# ============================================================
|
| 85 |
+
|
| 86 |
+
class VectorStoreManager:
    """Wrap a vector store with batched ingestion, retrieval and persistence.

    The backend is chosen by ``store_type`` (``"chroma"`` or ``"faiss"``).
    Unless a ready-made store is supplied, an existing on-disk store under
    ``persist_directory`` is loaded when present, otherwise an empty one is
    created through ``VectorStoreFactory``.
    """

    def __init__(
        self,
        vector_store: Optional[VectorStore] = None,
        store_type: Optional[str] = None,
        embedding_function: Optional[Embeddings] = None,
        persist_directory: Optional[str | Path] = None,
    ):
        """Set up the manager.

        Args:
            vector_store: Pre-built store to manage; when omitted one is
                loaded or created according to ``store_type``.
            store_type: Backend name; defaults to ``config.VECTOR_STORE_TYPE``.
            embedding_function: Embedding model; defaults to
                ``get_embedding_model()``.
            persist_directory: Root directory for persisted data; defaults to
                ``config.VECTOR_DB_DIR``.

        Raises:
            ValueError: If no store is supplied and ``store_type`` is not
                supported.
        """
        self.store_type = store_type or config.VECTOR_STORE_TYPE
        self.embedding_function = embedding_function or get_embedding_model()
        self.persist_directory = str(persist_directory or config.VECTOR_DB_DIR)
        # Identity check rather than truthiness: a supplied store must be used
        # even if its class happens to define __len__/__bool__.
        self._store = vector_store if vector_store is not None else self._init_store()

    def _init_store(self) -> VectorStore:
        """Load or create the store matching ``self.store_type``."""
        if self.store_type == "chroma":
            return self._init_chroma()
        elif self.store_type == "faiss":
            return self._init_faiss()
        else:
            raise ValueError(f"不支持的向量数据库: {self.store_type}")

    def _init_chroma(self) -> Chroma:
        """Open the Chroma store under ``<persist_directory>/chroma``.

        A non-empty directory is opened as an existing database; otherwise a
        new store is created through the factory.
        """
        persist_dir = Path(self.persist_directory) / "chroma"
        if persist_dir.exists() and any(persist_dir.iterdir()):
            logger.info(f"加载已有 Chroma 数据库: {persist_dir}")
            return Chroma(
                persist_directory=str(persist_dir),
                embedding_function=self.embedding_function,
                collection_name=config.CHROMA_COLLECTION_NAME,
            )
        return VectorStoreFactory.create_chroma(
            persist_directory=str(persist_dir),
            embedding_function=self.embedding_function,
        )

    def _init_faiss(self) -> FAISS:
        """Load the FAISS index under ``<persist_directory>/faiss_index``.

        Falls back to a new empty store when no saved index file is present.
        """
        index_path = Path(self.persist_directory) / "faiss_index"
        # save_local() writes "index.faiss"; checking for the file instead of
        # the directory avoids a load failure on an empty or half-written
        # directory.
        if (index_path / "index.faiss").exists():
            logger.info(f"加载已有 FAISS 数据库: {index_path}")
            # SECURITY: load_local unpickles the docstore. This is only safe
            # because the index directory is written by this application;
            # never point it at untrusted files.
            return FAISS.load_local(
                str(index_path),
                self.embedding_function,
                allow_dangerous_deserialization=True,
            )
        return VectorStoreFactory.create_faiss(
            embedding_function=self.embedding_function,
        )

    @property
    def store(self) -> VectorStore:
        """The underlying vector store instance."""
        return self._store

    # ---- Ingestion ----

    def add_documents(
        self,
        documents: List[Document],
        batch_size: int = 50,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> int:
        """Embed and store documents in batches, then persist.

        Args:
            documents: Chunks to add.
            batch_size: Number of documents sent to the store per call.
            progress_callback: Optional ``callback(done, total)`` invoked after
                each batch.

        Returns:
            The number of documents added (0 for an empty input).

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if not documents:
            logger.warning("文档列表为空, 跳过入库")
            return 0
        # A negative step would make the loop a silent no-op that still
        # reports success, so reject it explicitly.
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        total = len(documents)
        logger.info(f"开始向量化入库: {total} 个文档块 (批大小={batch_size})")

        for i in range(0, total, batch_size):
            batch = documents[i : i + batch_size]
            self._store.add_documents(batch)
            if progress_callback:
                progress_callback(min(i + batch_size, total), total)

        self._persist()
        logger.info(f"向量化入库完成: {total} 个文档块")
        return total

    def add_texts(
        self,
        texts: List[str],
        metadatas: Optional[List[dict]] = None,
        batch_size: int = 50,
    ) -> List[str]:
        """Embed and store raw texts in batches, then persist.

        Args:
            texts: Texts to add.
            metadatas: Optional metadata dicts, sliced in step with ``texts``.
            batch_size: Number of texts sent to the store per call.

        Returns:
            The IDs returned by the store, in insertion order.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if not texts:
            return []
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        all_ids = []
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i : i + batch_size]
            batch_metas = metadatas[i : i + batch_size] if metadatas else None
            ids = self._store.add_texts(batch_texts, batch_metas)
            all_ids.extend(ids)
        self._persist()
        return all_ids

    # ---- Retrieval ----

    def similarity_search(
        self,
        query: str,
        k: int = config.RETRIEVAL_TOP_K,
        filter: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> List[Document]:
        """Return the ``k`` documents most similar to ``query``.

        Args:
            query: Query text.
            k: Number of results.
            filter: Optional metadata filter, forwarded to the store.
            **kwargs: Extra arguments for the store's ``similarity_search``.
        """
        # Forward the filter to every backend. Dropping it for non-Chroma
        # stores silently returned unfiltered results.
        if filter:
            kwargs["filter"] = filter
        return self._store.similarity_search(query, k=k, **kwargs)

    def similarity_search_with_score(
        self,
        query: str,
        k: int = config.RETRIEVAL_TOP_K,
        filter: Optional[Dict[str, Any]] = None,
        score_threshold: float = 0.3,
        **kwargs,
    ) -> List[tuple]:
        """Return ``(document, relevance_score)`` pairs above a threshold.

        Args:
            query: Query text.
            k: Number of candidates requested from the store.
            filter: Optional metadata filter, forwarded to the store.
            score_threshold: Minimum relevance score to keep a result.
            **kwargs: Extra arguments for the store call.
        """
        if filter:
            kwargs["filter"] = filter
        raw = self._store.similarity_search_with_relevance_scores(
            query, k=k, **kwargs
        )
        # Original author's note: with Qwen3-Embedding a cosine similarity
        # above 0.5 usually means "relevant"; the default threshold is looser.
        return [(doc, score) for doc, score in raw if score >= score_threshold]

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = config.RETRIEVAL_TOP_K,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        filter: Optional[Dict[str, Any]] = None,
    ) -> List[Document]:
        """Run an MMR search through the underlying store.

        Args:
            query: Query text.
            k: Number of results to return.
            fetch_k: Number of candidates fetched before MMR re-ranking.
            lambda_mult: Value passed through as the store's ``lambda_mult``.
            filter: Optional metadata filter, forwarded to the store.
        """
        extra = {"filter": filter} if filter else {}
        return self._store.max_marginal_relevance_search(
            query, k=k, fetch_k=fetch_k, lambda_mult=lambda_mult, **extra
        )

    # ---- Filtered queries ----

    def search_by_document(
        self, query: str, document_name: str, k: int = config.RETRIEVAL_TOP_K
    ) -> List[Document]:
        """Search only chunks whose ``document_name`` metadata matches."""
        return self.similarity_search(query, k=k, filter={"document_name": document_name})

    def search_by_page_range(
        self, query: str, start_page: int, end_page: int,
        k: int = config.RETRIEVAL_TOP_K,
    ) -> List[Document]:
        """Search only chunks whose ``page`` metadata is in [start, end]."""
        # Chroma rejects a field expression holding two operators
        # ({"$gte": ..., "$lte": ...}); the range must be written as an $and
        # of two single-operator clauses.
        page_filter = {
            "$and": [
                {"page": {"$gte": start_page}},
                {"page": {"$lte": end_page}},
            ]
        }
        return self.similarity_search(query, k=k, filter=page_filter)

    # ---- Management ----

    def _persist(self):
        """Write the FAISS index to disk; other store types are left alone."""
        if self.store_type == "faiss":
            index_path = Path(self.persist_directory) / "faiss_index"
            index_path.mkdir(parents=True, exist_ok=True)
            self._store.save_local(str(index_path))

    def clear(self):
        """Drop all stored data and replace the store with an empty one."""
        import shutil

        if self.store_type == "chroma":
            self._store.delete_collection()
            self._store = VectorStoreFactory.create_chroma(
                persist_directory=Path(self.persist_directory) / "chroma",
                embedding_function=self.embedding_function,
            )
        elif self.store_type == "faiss":
            # Remove the saved index as well; otherwise the "cleared" data
            # would be loaded again the next time a manager is created.
            index_path = Path(self.persist_directory) / "faiss_index"
            if index_path.exists():
                shutil.rmtree(index_path)
            self._store = VectorStoreFactory.create_faiss(
                embedding_function=self.embedding_function,
            )
        logger.info("向量数据库已清空")

    def get_document_count(self) -> int:
        """Return the number of stored entries, or 0 when it cannot be read."""
        try:
            if self.store_type == "chroma":
                return self._store._collection.count()
            if self.store_type == "faiss":
                index = self._store.index
                return index.ntotal if index is not None else 0
        except Exception as exc:
            # Best effort for a statistics call: report the problem instead of
            # hiding it, but do not fail the caller.
            logger.warning(f"获取文档数量失败: {exc}")
        return 0

    def get_stats(self) -> Dict[str, Any]:
        """Return a summary dict: store type, directory, count and model name."""
        return {
            "store_type": self.store_type,
            "persist_directory": self.persist_directory,
            "document_count": self.get_document_count(),
            "embedding_model": config.EMBEDDING_MODEL_NAME,
        }
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
# ============================================================
|
| 281 |
+
# 便捷函数
|
| 282 |
+
# ============================================================
|
| 283 |
+
|
| 284 |
+
def build_vector_store(
    documents: List[Document],
    store_type: Optional[str] = None,
    embedding_model: Optional[Embeddings] = None,
    clear_existing: bool = False,
) -> VectorStoreManager:
    """Create a manager, optionally clear it, and ingest ``documents``.

    Args:
        documents: Chunks to add to the store.
        store_type: Backend name handed to ``VectorStoreManager``.
        embedding_model: Embedding model handed to ``VectorStoreManager``.
        clear_existing: When true, ``clear()`` is called before ingestion.

    Returns:
        The manager holding the populated store.
    """
    store_manager = VectorStoreManager(
        embedding_function=embedding_model,
        store_type=store_type,
    )

    if clear_existing:
        store_manager.clear()

    store_manager.add_documents(documents)
    return store_manager
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def load_vector_store(
    store_type: Optional[str] = None,
    embedding_model: Optional[Embeddings] = None,
) -> VectorStoreManager:
    """Return a ``VectorStoreManager`` for the given backend and model.

    Args:
        store_type: Backend name handed to ``VectorStoreManager``.
        embedding_model: Embedding model handed to ``VectorStoreManager``.
    """
    manager_options = {
        "store_type": store_type,
        "embedding_function": embedding_model,
    }
    return VectorStoreManager(**manager_options)
|
国药准字H37020386_布洛芬片.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:22954cb51781e685a42d5dd1abac0bde98906af75a6097871e3a937bdeaa4cdf
|
| 3 |
+
size 125946
|