H022329 commited on
Commit
92a1ebc
·
verified ·
1 Parent(s): df9fb03

Upload folder using huggingface_hub

Browse files
.claude/settings.local.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "WebSearch",
5
+ "Bash(/data/huangjie/miniforge3/envs/cad/bin/python --version)",
6
+ "Bash(/data/huangjie/miniforge3/envs/cad/bin/pip show *)",
7
+ "Bash(timeout 5 /data/huangjie/miniforge3/envs/cad/bin/python -c \"import pymupdf; print\\('pymupdf OK, version:', pymupdf.version\\)\")",
8
+ "Bash(python:*)"
9
+ ]
10
+ }
11
+ }
.env.example CHANGED
@@ -1,47 +1,46 @@
 
 
1
 
2
- # # LLM API(OpenAI API格式)
3
- # EMBEDDING_MODEL_PATH = "/data/huangjie/.cache/modelscope/hub/models/Qwen/Qwen3-VL-Embedding-2B"
4
- EMBEDDING_API_BASE_URL = "http://10.126.102.211:8010/v1/"
5
- # EMBEDDING_API_KEY = "xxx"
6
- EMBEDDING_API_PORT = 8010
7
- EMBEDDING_MODEL_NAME = "AXERA-TECH/Qwen3-VL-Embedding-2B"
8
 
9
 
10
- # # VLM API(OpenAI API格式)
11
- # VLM_MODEL_PATH = "/data/huangjie/.cache/modelscope/hub/models/Qwen/Qwen3-VL-2B-Instruct"
12
- VLM_API_BASE_URL = "http://10.126.102.211:8011/v1/"
13
- # VLM_API_KEY = "xxxx"
14
- VLM_API_PORT = 8011
15
- VLM_MODEL_NAME = "AXERA-TECH/Qwen3-VL-2B-Instruct"
16
 
17
 
18
- # # LLM API(OpenAI API格式)
19
- # LLM_MODEL_PATH = "/data/huangjie/.cache/modelscope/hub/models/Qwen/Qwen3-VL-Embedding-2B"
20
- LLM_API_BASE_URL = "http://10.126.102.211:8012/v1/"
21
- # LLM_API_KEY = "xxx"
22
- LLM_API_PORT = 8012
23
- LLM_MODEL_NAME = "AXERA-TECH/Qwen3-1.7B"
24
 
25
 
26
- # # ASR API
27
- SHERPA_MODEL_DIR = "/root/huangjie/AXERA-TECH/SenseVoice"
28
- SHERPA_ASR_URL = "http://10.126.102.211:8013"
29
- # ASR_API_KEY = "xxx"
30
- SHERPA_ASR_API_PORT = 8013
31
- SHERPA_MODEL_FILE = "/root/huangjie/AXERA-TECH/SenseVoice/ax650/model-10-seconds.axmodel"
32
 
 
 
 
33
 
34
- # # Tokenizer API
35
- Tokenizer_MODEL_PATH = "./VideoAgent/_llm/tokenizer_model/Qwen/Qwen3-1.7B" # Tokenizer 文件夹(含tokenizer_config.json, tokenizer.json, vocab.json文件)
36
- Tokenizer_API_BASE_URL = "http://0.0.0.0:8014/"
37
- Tokenizer_API_KEY = "xxxx"
38
- Tokenizer_API_PORT = 8014
39
 
 
 
40
 
41
-
42
- VIDEORAG_VIDEO_SEGMENT_LENGTH = "10"
43
- VIDEORAG_ROUGH_NUM_FRAMES_PER_SEGMENT = "5"
44
- VIDEORAG_RETRIEVAL_TOPK_CHUNKS = "2"
45
- VIDEORAG_QUERY_BETTER_THAN_THRESHOLD = "0.2"
46
- VIDEORAG_CHUNK_TOKEN_SIZE = "800"
47
- VIDEORAG_SEGMENT_RETRIEVAL_TOP_K = "2"
 
1
+ # ============================================================
2
+ # 环境变量配置
3
 
4
+ # .env 配置
5
+ OCR_ENGINE=api # 改为 api 模式
6
+ OCR_API_BASE=http://127.0.0.1:8015/v1 # vLLM 服务地址
7
+ OCR_API_MODEL=AXERA-TECH/PaddleOCR-VL-1.5 # 模型名
8
+ OCR_API_KEY=not-needed
9
+ OCR_TASK=ocr # 任务类型
10
 
11
 
12
+ EMBEDDING_MODEL_NAME=AXERA-TECH/Qwen3-Embedding-0.6B
13
+ EMBEDDING_API_BASE=http://127.0.0.1:8014/v1
14
+ EMBEDDING_API_KEY=not-needed
15
+ EMBEDDING_BATCH_SIZE=4
 
 
16
 
17
 
18
+ LLM_API_BASE=http://127.0.0.1:8013/v1
19
+ LLM_API_KEY=not-needed
20
+ LLM_MODEL_NAME=AXERA-TECH/Qwen3-1.7B-GPTQ-Int4
21
+ LLM_TEMPERATURE=0.1
22
+ LLM_MAX_TOKENS=2048
 
23
 
24
 
25
+ # ---- 向量数据库 ----
26
+ VECTOR_STORE_TYPE=chroma
27
+ CHROMA_COLLECTION_NAME=pdf_ocr_knowledge
 
 
 
28
 
29
+ # ---- PDF 渲染 ----
30
+ PDF_RENDER_DPI=300
31
+ MAX_FILE_SIZE_MB=50
32
 
33
+ # ---- 文本分割与检索 ----
34
+ CHUNK_SIZE=800
35
+ CHUNK_OVERLAP=150
36
+ RETRIEVAL_TOP_K=5
 
37
 
38
+ # ---- 日志 ----
39
+ LOG_LEVEL=INFO
40
 
41
+ OCR_VL_BACKEND=native
42
+ OCR_USE_LAYOUT=false
43
+ OCR_LAYOUT_THRESHOLD=0.5
44
+ OCR_USE_CHART=false
45
+ OCR_MAX_NEW_TOKENS=4096
46
+ OCR_TEMPERATURE=0.0
 
.gitattributes CHANGED
@@ -44,3 +44,11 @@ image-10.png filter=lfs diff=lfs merge=lfs -text
44
  image-7.png filter=lfs diff=lfs merge=lfs -text
45
  image-8.png filter=lfs diff=lfs merge=lfs -text
46
  image-9.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
44
  image-7.png filter=lfs diff=lfs merge=lfs -text
45
  image-8.png filter=lfs diff=lfs merge=lfs -text
46
  image-9.png filter=lfs diff=lfs merge=lfs -text
47
+ assets/image-1.png filter=lfs diff=lfs merge=lfs -text
48
+ assets/image-13.png filter=lfs diff=lfs merge=lfs -text
49
+ assets/image-14.png filter=lfs diff=lfs merge=lfs -text
50
+ assets/image-16.png filter=lfs diff=lfs merge=lfs -text
51
+ assets/image.png filter=lfs diff=lfs merge=lfs -text
52
+ assets/OCR_RAG.mp4 filter=lfs diff=lfs merge=lfs -text
53
+ test.png filter=lfs diff=lfs merge=lfs -text
54
+ 国药准字H37020386_布洛芬片.pdf filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,2 +1,5 @@
1
  .env
2
- uphg.py
 
 
 
 
1
  .env
2
+ uphg.py
3
+ .claude/
4
+ .vscode/
5
+ __pycache__/
README.md CHANGED
@@ -1,40 +1,88 @@
1
- # VideoAgent — 视频理解分析(基于 AX650N)
2
 
3
- 基于 AX650N 芯片平台,构建多模态 VideoAgent面向视频理解与检索,支持长视频智能分析自然语言问答。
4
 
5
- <p align="center">
6
- <img src="https://img.shields.io/badge/platform-AX650N-blue" alt="Platform">
7
- <img src="https://img.shields.io/badge/python-3.10+-green" alt="Python">
8
-
9
- </p>
10
 
11
- ---
12
-
13
- ## 核心功能
 
 
14
 
15
- - **芯片平台部署** — 基于 AX650N 芯片部署全部模型,端到端运行完整流程
16
- - **视频智能索引** — 自动分段、特征提取、多模态信息融合(ASR + VLM)
17
- - **向量检索** — 高效相似度检索与结果融合,支持跨模态查询
18
- - **自然语言问答** — 用自然语言提问,基于视频内容生成回答
19
 
20
- ---
 
21
 
22
- ## 模型配置
23
 
24
- 基于 AX650N 芯片平台运行前,请下载以下模型并参照相关文档完成部署:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
- | 模型类型 | 模型名称 | 说明 |
27
- |---------|---------|------|
28
- | **ASR** | [SenseVoiceSmall-axmodel](https://huggingface.co/M5Stack/SenseVoiceSmall-axmodel) | 多语言语音理解模型 |
29
- | **VLM** | [Qwen3-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/AXERA-TECH/Qwen3-VL-2B-Instruct-GPTQ-Int4) | 多模态视觉语言模型 |
30
- | **LLM** | [Qwen3-1.7B](https://huggingface.co/AXERA-TECH/Qwen3-1.7B) | 大语言模型 |
31
- | **Embedding** | [Qwen3-VL-Embedding-2B-AX650](https://huggingface.co/AXERA-TECH/Qwen3-VL-Embedding-2B-AX650-C128_P1280_CTX1407) | 多模态嵌入模型 |
32
 
33
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  ## 快速开始
36
 
37
- ### 1. 安装依赖
38
 
39
  ```bash
40
  pip install -r requirements.txt
@@ -42,7 +90,7 @@ pip install -r requirements.txt
42
 
43
  ### 2. 配置环境变量
44
 
45
- EmbeddingVLM、LLM、ASR、Tokenizer 均通过环境变量配置。其中 Embedding、VLM、LLM 兼容 OpenAI API 格式。
46
 
47
  ```bash
48
  cp .env.example .env
@@ -53,22 +101,24 @@ cp .env.example .env
53
 
54
  ```ini
55
  # LLM API(OpenAI API 格式)
56
- LLM_MODEL_PATH = "/data/huangjie/.cache/modelscope/hub/models/Qwen/Qwen3-VL-Embedding-2B"
57
- LLM_API_BASE_URL = "http://0.0.0.0:8012/v1/"
58
- LLM_API_KEY = "xxx"
59
- LLM_MODEL_NAME = "AXERA-TECH/Qwen3-1.7B"
60
- LLM_API_PORT = 8012
61
-
62
- # ASR API
63
- SHERPA_MODEL_DIR = "/root/huangjie/AXERA-TECH/SenseVoice"
64
- SHERPA_ASR_URL = "http://0.0.0.0:8013"
65
- SHERPA_ASR_API_PORT = 8013
66
- SHERPA_MODEL_FILE = "/root/huangjie/AXERA-TECH/SenseVoice/ax650/model-10-seconds.axmodel"
67
-
68
- # Tokenizer API
69
- Tokenizer_MODEL_PATH = "/root/huangjie/project/VideoAgent_api507/VideoAgent/_llm/tokenizer_model/Qwen/Qwen3-1.7B"
70
- Tokenizer_API_BASE_URL = "http://0.0.0.0:8014"
71
- Tokenizer_API_PORT = 8014
 
 
72
  ```
73
 
74
  ### 3. 启动模型服务
@@ -76,86 +126,158 @@ Tokenizer_API_PORT = 8014
76
  基于 AX650N 芯片启动各模型服务:
77
 
78
  ```bash
79
- # Embedding 服务 — 端口 8010
80
- axllm serve /root/huangjie/AXERA-TECH/models--AXERA-TECH--Qwen3-VL-Embedding-2B-AX650-C128_P1280_CTX1407 --port 8010
 
 
 
81
 
82
- # VLM 服务 — 端口 8011
83
- axllm serve /root/huangjie/AXERA-TECH/Qwen3-VL-2B-Instruct-GPTQ-Int4 --port 8011
 
84
 
85
- # LLM 服务 — 端口 8012
86
- axllm serve /root/huangjie/AXERA-TECH/models--AXERA-TECH--Qwen3-1.7B --port 8012
87
 
88
- # ASR 服务 — 端口 8013
89
- python VideoAgent/_server/sherpa_asr_server.py
90
 
91
- # Tokenizer 服务 — 端口 8014
92
- python VideoAgent/_server/tokenizer_server.py
93
  ```
94
 
95
- ### 4. 使用方式
96
 
97
- #### Web UI(推荐)
98
 
99
- ```bash
100
- python webui.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  ```
102
 
103
- 浏览器访问 **http://localhost:7869**
104
 
105
- | 索引界面 | 检索界面 |
106
- |---------|---------|
107
- | ![索引界面](image-5.png) | ![检索界面](image-4.png) |
 
 
 
 
 
 
 
108
 
109
- #### Python SDK
110
 
111
  ```python
112
- from VideoAgent import VideoRAG, QueryParam
 
 
 
 
 
 
 
113
 
114
- # 初始化 RAG 系统
115
- rag = VideoRAG(working_dir="./working_dir")
 
116
 
117
- # 索引视频文件
118
- rag.insert_video(video_path_list=["video1.mp4", "video2.mp4"])
119
 
120
- # 查询视频内容
121
- result = rag.query(query="视频中什么时候出现张飞?", param=QueryParam())
122
- print(result)
123
  ```
124
 
125
  ---
126
 
127
- ## 工作流程
128
 
129
- ### 视频索引流程
130
 
131
- ![索引流程](image-2.png)
132
- ### 查询流程
133
 
 
134
 
135
- ![查询流程](image-3.png)
136
 
 
137
 
138
- ---
139
 
140
- ## 项目结构
141
 
142
- ```
143
- VideoAgent-AX650N/
144
- ├── VideoAgent/ # 核心包
145
- │ ├── _llm/ # 模型定义层
146
- │ ├── _server/ # 服务层(FastAPI)
147
- │ ├── _storage/ # 存储层
148
- │ ├── _videoutil/ # 视频处理工具
149
- │ └── vidrag_pipeline.py # 核心管道
150
- ├── working_dir/ # 运行时数据目录
151
- ├── webui.py # Gradio Web 入口
152
- ├── videorag_longervideos.py # 测试脚本
153
- └── README.md # 项目文档
154
- ```
155
 
156
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
- ## 参考项目
159
 
160
- - 香港大学数据科学实验室HKUDS [VideoRAG](https://github.com/HKUDS/VideoRAG)超长视频跨模态检索增强生成框架
161
 
 
 
1
+ # 基于 OCR + RAG 的文档智能问答系统
2
 
3
+ 基于 **PaddleOCR-VL** + **Qwen3-Embedding** + **Qwen3** + **LangChain RAG** 的文档智能问答系统,支持对 PDF、扫描件及常见图片格式进行端到端的识别、检索与问答。
4
 
5
+ ## 模型栈
 
 
 
 
6
 
7
+ | 模型类型 | 模型名称 | 说明 |
8
+ |---------|---------|------|
9
+ | **OCR** | [PaddleOCR-VL-1.5](https://huggingface.co/AXERA-TECH/PaddleOCR-VL-1.5) | OCR 识别模型 |
10
+ | **LLM** | [Qwen3-1.7B-GPTQ-Int4](https://huggingface.co/AXERA-TECH/Qwen3-1.7B-GPTQ-Int4) | 大语言模型 |
11
+ | **Embedding** | [Qwen3-Embedding-0.6B](https://huggingface.co/AXERA-TECH/Qwen3-Embedding-0.6B) | 文本嵌入模型 |
12
 
13
+ ## 支持格式
 
 
 
14
 
15
+ - PDF(文字型 / 扫描版)
16
+ - PNG / JPG / JPEG / BMP / TIF / TIFF
17
 
18
+ ## 架构
19
 
20
+ ```
21
+ 文件上传 (PDF/PNG/JPG/BMP/TIF)
22
+
23
+
24
+ ┌─────────────────────────────────┐
25
+ │ PaddleOCR-VL │
26
+ │ 端到端识别: 文本 + 版面 + 表格 │
27
+ │ 输出: Markdown / JSON │
28
+ └──────────────┬──────────────────┘
29
+ │ LangChain Documents
30
+
31
+ ┌─────────────────────────────────┐
32
+ │ 文本处理 │
33
+ │ Markdown 清洗 → 语义感知分割 │
34
+ │ 表格/公式 独立提取 │
35
+ └──────────────┬──────────────────┘
36
+ │ Document Chunks
37
+
38
+ ┌─────────────────────────────────┐
39
+ │ Qwen3-Embedding │
40
+ │ instruct-aware 向量嵌入 │
41
+ └──────────────┬──────────────────┘
42
+ │ Vector Embeddings
43
+
44
+ ┌─────────────────────────────────┐
45
+ │ Chroma / FAISS 向量数据库 │
46
+ │ 相似度检索 / MMR / 元数据过滤 │
47
+ └──────────────┬──────────────────┘
48
+ │ Top-K 相关文档
49
+
50
+ ┌─────────────────────────────────┐
51
+ │ Qwen3-1.7B │
52
+ │ LangChain LCEL RAG 链 │
53
+ │ 多轮对话 + 来源引用 │
54
+ └──────────────┬──────────────────┘
55
+
56
+
57
+ ┌─────────────────────────────────┐
58
+ │ Web UI (FastAPI) │
59
+ │ 上传 | 问答 | 来源 | 状态 │
60
+ └─────────────────────────────────┘
61
+ ```
62
 
63
+ ## 项目结构
 
 
 
 
 
64
 
65
+ ```
66
+ pdfocr/
67
+ ├── requirements.txt # Python 依赖
68
+ ├── .env.example # 环境变量模板
69
+ ├── config.py # 全局配置中心
70
+ ├── ocr_loader.py # PaddleOCR-VL 加载器 (支持多格式)
71
+ ├── text_processor.py # Markdown 清洗 + 智能分割
72
+ ├── embeddings.py # Qwen3-Embedding 向量嵌入
73
+ ├── vector_store.py # 向量数据库管理 (Chroma/FAISS)
74
+ ├── rag_chain.py # RAG 问答链 (Qwen3)
75
+ ├── app.py # Web UI
76
+ └── data/ # 运行时数据
77
+ ├── uploads/
78
+ ├── ocr_output/
79
+ ├── vector_db/
80
+ └── logs/
81
+ ```
82
 
83
  ## 快速开始
84
 
85
+ ### 1. 环境准备
86
 
87
  ```bash
88
  pip install -r requirements.txt
 
90
 
91
  ### 2. 配置环境变量
92
 
93
+ OCR、LLM、Embedding 均通过环境变量配置,且均兼容 OpenAI API 格式。
94
 
95
  ```bash
96
  cp .env.example .env
 
101
 
102
  ```ini
103
  # LLM API(OpenAI API 格式)
104
+ LLM_API_KEY=not-needed
105
+ LLM_API_BASE=http://127.0.0.1:8013/v1
106
+ LLM_MODEL_NAME=AXERA-TECH/Qwen3-1.7B-GPTQ-Int4
107
+ LLM_TEMPERATURE=0.1
108
+ LLM_MAX_TOKENS=2048
109
+
110
+ # Embedding API
111
+ EMBEDDING_MODEL_NAME=AXERA-TECH/Qwen3-Embedding-0.6B
112
+ EMBEDDING_API_BASE=http://127.0.0.1:8014/v1
113
+ EMBEDDING_API_KEY=not-needed
114
+ EMBEDDING_BATCH_SIZE=4
115
+
116
+ # OCR API
117
+ OCR_ENGINE=api
118
+ OCR_API_BASE=http://127.0.0.1:8015/v1
119
+ OCR_API_MODEL=AXERA-TECH/PaddleOCR-VL-1.5
120
+ OCR_API_KEY=not-needed
121
+ OCR_TASK=ocr
122
  ```
123
 
124
  ### 3. 启动模型服务
 
126
  基于 AX650N 芯片启动各模型服务:
127
 
128
  ```bash
129
+ # LLM 服务 — 端口 8013
130
+ axllm serve /root/huangjie/AXERA-TECH/models--AXERA-TECH--Qwen3-1.7B --port 8013
131
+
132
+ # Embedding 服务 — 端口 8014
133
+ axllm serve /root/huangjie/AXERA-TECH/models--AXERA-TECH--Qwen3-Embedding-0.6B --port 8014
134
 
135
+ # OCR 服务 — 端口 8015
136
+ axllm serve /root/huangjie/AXERA-TECH/PaddleOCR-VL-1.5 --port 8015
137
+ ```
138
 
139
+ ## 使用方式
 
140
 
141
+ ### 1. Web UI(推荐)
 
142
 
143
+ ```bash
144
+ python app.py
145
  ```
146
 
147
+ 浏览器访问 **http://localhost:7860**
148
 
149
+ **问答界面**
150
 
151
+ ![alt text](assets/image-16.png)
152
+
153
+ **预览界面**
154
+
155
+ ![alt text](assets/image-1.png)
156
+
157
+ **设置界面**
158
+
159
+ ![alt text](assets/image.png)
160
+
161
+ ### 2. Python API
162
+
163
+ ```python
164
+ from rag_chain import PDFRAGPipeline
165
+
166
+ # 初始化流水线
167
+ pipeline = PDFRAGPipeline()
168
+
169
+ # 处理文档 (支持 PDF/PNG/JPG/BMP/TIF)
170
+ pipeline.ingest("document.pdf")
171
+ pipeline.ingest("scan.png")
172
+
173
+ # 问答
174
+ result = pipeline.ask("文档主要内容是什么?")
175
+ print(result["answer"])
176
+ print(result["sources"])
177
+
178
+ # 多轮对话
179
+ result = pipeline.ask_with_history(
180
+ "那第二章呢?",
181
+ chat_history=[
182
+ {"role": "user", "content": "文档主要讲什么?"},
183
+ {"role": "assistant", "content": "文档主要介绍了..."},
184
+ ]
185
+ )
186
+
187
+ # 流式输出
188
+ for chunk in pipeline.ask_stream("请总结文档"):
189
+ print(chunk, end="", flush=True)
190
  ```
191
 
192
+ ### 3. 命令行
193
 
194
+ ```bash
195
+ # 直接对文件提问
196
+ python rag_chain.py document.pdf "文档主要内容是什么?"
197
+
198
+ # OCR 识别并输出 Markdown
199
+ python ocr_loader.py scan.png --md
200
+
201
+ # OCR 识别并输出 JSON
202
+ python ocr_loader.py document.pdf --json
203
+ ```
204
 
205
+ ### 4. 分步使用
206
 
207
  ```python
208
+ from ocr_loader import PaddleOCRLoader
209
+ from text_processor import TextProcessingPipeline
210
+ from vector_store import build_vector_store
211
+ from rag_chain import RAGChain
212
+
213
+ # 1. OCR
214
+ loader = PaddleOCRLoader("document.pdf", dpi=300)
215
+ documents = loader.load()
216
 
217
+ # 2. 文本处理
218
+ pipeline = TextProcessingPipeline(chunk_size=800, chunk_overlap=150)
219
+ chunks = pipeline.process(documents)
220
 
221
+ # 3. 向量化
222
+ manager = build_vector_store(chunks)
223
 
224
+ # 4. 问答
225
+ chain = RAGChain(vector_store_manager=manager)
226
+ result = chain.query("文档主要内容?")
227
  ```
228
 
229
  ---
230
 
231
+ ## 案例演示
232
 
233
+ ### 演示视频
234
 
235
+ [观看演示视频](assets/OCR_RAG.mp4)
 
236
 
237
+ ### 使用步骤
238
 
239
+ **1. 在 AX650N 芯片上启动模型服务**
240
 
241
+ LLM 服务
242
 
243
+ ![alt text](assets/image-4.png)
244
 
245
+ Embedding 服务
246
 
247
+ ![alt text](assets/image-3.png)
 
 
 
 
 
 
 
 
 
 
 
 
248
 
249
+ OCR 服务
250
+
251
+ ![alt text](assets/image-2.png)
252
+
253
+ 运行启动服务
254
+
255
+ ![alt text](assets/image-5.png)
256
+
257
+ **2. 上传原始文件**
258
+
259
+ 支持 PDF / PNG / JPG / BMP / TIF
260
+
261
+ ![alt text](assets/image-12.png)
262
+
263
+ **3. 进行 OCR 识别**
264
+
265
+ OCR 识别并输出文本,支持原始文件和 OCR 结果同时查看:
266
+
267
+ ![alt text](assets/image-13.png)
268
+
269
+ ![alt text](assets/image-14.png)
270
+
271
+ **4. RAG 智能问答**
272
+
273
+ 根据输入内容检索相关文本片段并返回结果。
274
+
275
+ 例如提问「布洛芬每日用量」,系统检索到说明书中关于用量的文本片段,依据该文本进行回答:
276
+
277
+ ![alt text](assets/image-15.png)
278
 
279
+ ## 硬件资源使用
280
 
281
+ 基于 AX650N 平台运行本项目时,内存、CMM、Flash 占用情况如下:
282
 
283
+ ![alt text](assets/image-7.png)
__pycache__/app.cpython-310.pyc ADDED
Binary file (16 kB). View file
 
__pycache__/app.cpython-312.pyc ADDED
Binary file (20.9 kB). View file
 
__pycache__/config.cpython-310.pyc ADDED
Binary file (3.2 kB). View file
 
__pycache__/config.cpython-312.pyc ADDED
Binary file (4.98 kB). View file
 
__pycache__/embeddings.cpython-310.pyc ADDED
Binary file (5.54 kB). View file
 
__pycache__/embeddings.cpython-312.pyc ADDED
Binary file (8.24 kB). View file
 
__pycache__/ocr_loader.cpython-310.pyc ADDED
Binary file (22.5 kB). View file
 
__pycache__/ocr_loader.cpython-312.pyc ADDED
Binary file (37.8 kB). View file
 
__pycache__/rag_chain.cpython-310.pyc ADDED
Binary file (13 kB). View file
 
__pycache__/rag_chain.cpython-312.pyc ADDED
Binary file (19.6 kB). View file
 
__pycache__/run.cpython-310.pyc ADDED
Binary file (13.3 kB). View file
 
__pycache__/run.cpython-312.pyc ADDED
Binary file (20.6 kB). View file
 
__pycache__/text_processor.cpython-310.pyc ADDED
Binary file (14.3 kB). View file
 
__pycache__/text_processor.cpython-312.pyc ADDED
Binary file (22.7 kB). View file
 
__pycache__/vector_store.cpython-310.pyc ADDED
Binary file (9.13 kB). View file
 
__pycache__/vector_store.cpython-312.pyc ADDED
Binary file (13.9 kB). View file
 
app.py ADDED
@@ -0,0 +1,629 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ============================================================
3
+ OCR RAG 智能问答系统 - Web UI (FastAPI)
4
+ ============================================================
5
+
6
+ 启动:
7
+ python app.py
8
+ 访问: http://localhost:7860
9
+
10
+ """
11
+
12
+ import gc
13
+ import time
14
+ import shutil
15
+ from pathlib import Path
16
+ from typing import List, Optional, Dict, Any, Tuple
17
+
18
+
19
+
20
def _apply_env_patches():
    """Work around known environment incompatibilities as early as possible.

    Must run before the project modules are imported, because it rewrites
    ``sys.modules``.
    """
    import sys
    import types

    module_name = "langchain_text_splitters"

    # Step 1: register an empty stand-in package so that the real package's
    # __init__.py (which pulls in the broken
    # sentence_transformers -> transformers chain) is never executed.
    if module_name not in sys.modules:
        stub = types.ModuleType(module_name)
        stub.__path__ = []
        sys.modules[module_name] = stub

    # Step 2: expose the project's own RecursiveCharacterTextSplitter under
    # the name that downstream imports expect.
    target = sys.modules[module_name]
    from text_processor import RecursiveCharacterTextSplitter as OurSplitter
    target.RecursiveCharacterTextSplitter = OurSplitter

    # Step 3: make sure torch is loaded before transformers looks for it;
    # a missing torch is tolerated.
    if "torch" not in sys.modules:
        try:
            import torch  # noqa: F401
        except ImportError:
            pass
43
+
44
+
45
+ _apply_env_patches()
46
+
47
+ from fastapi import FastAPI, File, Form, UploadFile, HTTPException
48
+ from fastapi.responses import HTMLResponse, FileResponse, JSONResponse
49
+ from fastapi.staticfiles import StaticFiles
50
+ from pydantic import BaseModel
51
+ from loguru import logger
52
+
53
+ import config
54
+ from rag_chain import PDFRAGPipeline, RAGChain
55
+ from vector_store import VectorStoreManager
56
+ from ocr_loader import PaddleOCRLoader
57
+ from text_processor import TextProcessingPipeline
58
+
59
+
60
+ # ============================================================
61
+ # 全局状态
62
+ # ============================================================
63
+
64
# Lazily created RAG pipeline shared by all requests.
_pipeline: Optional[PDFRAGPipeline] = None
# Metadata dicts for every file processed so far (persisted to _FILES_JSON).
_processed_files: List[Dict[str, Any]] = []
# Alternating user/assistant messages of the current conversation.
_chat_history: List[Dict[str, str]] = []

# Directory where OCR text is persisted, plus the JSON index of processed files.
_OCR_OUTPUT_DIR = config.OCR_OUTPUT_DIR
_FILES_JSON = _OCR_OUTPUT_DIR / "_files.json"
71
+
72
+
73
def _load_files_from_disk():
    """Restore the processed-file list from ``_FILES_JSON`` at startup.

    Does nothing when the index file is absent; any failure while reading
    or parsing it is logged as a warning and otherwise ignored.
    """
    global _processed_files
    if not _FILES_JSON.exists():
        return
    try:
        import json
        payload = json.loads(_FILES_JSON.read_text(encoding="utf-8"))
        _processed_files = payload.get("files", [])
        logger.info(f"从磁盘恢复 {len(_processed_files)} 个已处理文件")
    except Exception as e:
        logger.warning(f"恢复文件列表失败: {e}")
84
+
85
+
86
def _save_files_to_disk():
    """Persist the processed-file list to ``_FILES_JSON`` as UTF-8 JSON."""
    import json

    serialized = json.dumps(
        {"files": _processed_files}, ensure_ascii=False, indent=2
    )
    # Create the output directory on first use.
    _FILES_JSON.parent.mkdir(parents=True, exist_ok=True)
    _FILES_JSON.write_text(serialized, encoding="utf-8")
94
+
95
+
96
def _get_ocr_text_path(filename: str) -> Path:
    """Return the on-disk location of the OCR text for *filename*.

    Only the stem of *filename* is used, so two uploads that differ only in
    extension map to the same ``.txt`` file.
    """
    stem = Path(filename).stem
    return _OCR_OUTPUT_DIR / f"{stem}.txt"
99
+
100
+
101
def _save_ocr_text(filename: str, text: str):
    """Write *text* as the OCR result for *filename*, creating the directory."""
    target = _get_ocr_text_path(filename)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(text, encoding="utf-8")
106
+
107
+
108
def _load_ocr_text(filename: str) -> str:
    """Return the stored OCR text for *filename*, or ``""`` if none exists."""
    source = _get_ocr_text_path(filename)
    if not source.exists():
        return ""
    return source.read_text(encoding="utf-8")
114
+
115
+
116
def _delete_ocr_text(filename: str):
    """Remove the stored OCR text for *filename* if it is present."""
    target = _get_ocr_text_path(filename)
    if not target.exists():
        return
    target.unlink()
121
+
122
+
123
def get_pipeline() -> PDFRAGPipeline:
    """Return the shared pipeline, creating it with defaults on first use."""
    global _pipeline
    if _pipeline is not None:
        return _pipeline
    _pipeline = PDFRAGPipeline(verbose=False)
    return _pipeline
128
+
129
+
130
+ # ============================================================
131
+ # 核心处理逻辑 (从原 Gradio 回调中提取)
132
+ # ============================================================
133
+
134
def process_file_impl(
    file_path: Path,
    chunk_size: int = 800,
    chunk_overlap: int = 150,
) -> Tuple[Dict[str, Any], str]:
    """Run one uploaded file through OCR, chunking and vector indexing.

    Args:
        file_path: Location of the uploaded file on disk.
        chunk_size: Chunk size handed to the text splitter (and to the
            pipeline when it is created by this call).
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        ``(file_info, preview)`` where ``file_info`` is the metadata dict
        appended to ``_processed_files`` and ``preview`` is a short text
        excerpt of the first three pages.

    Raises:
        ValueError: If the extension is not in ``config.SUPPORTED_FORMATS``
            or the file exceeds ``config.MAX_FILE_SIZE_MB``.
    """
    global _pipeline, _processed_files, _chat_history

    suffix = file_path.suffix.lower()

    if suffix not in config.SUPPORTED_FORMATS:
        raise ValueError(
            f"不支持的文件格式: {suffix}\n支持: {', '.join(sorted(config.SUPPORTED_FORMATS))}"
        )

    file_size_mb = file_path.stat().st_size / (1024 * 1024)
    if file_size_mb > config.MAX_FILE_SIZE_MB:
        raise ValueError(f"文件过大: {file_size_mb:.1f}MB (限制: {config.MAX_FILE_SIZE_MB}MB)")

    # Reuse the pipeline object so the LLM client is not rebuilt per upload.
    if _pipeline is None:
        _pipeline = PDFRAGPipeline(
            chunk_size=int(chunk_size),
            chunk_overlap=int(chunk_overlap),
            verbose=False,
        )

    loader = PaddleOCRLoader(str(file_path), verbose=False)
    raw_docs = loader.load()

    # Record the page count now: raw_docs is emptied further down, and the
    # original code read len(raw_docs) only after that, so it always fell
    # back to re-reading the OCR file from disk.
    page_count = len(raw_docs)

    # Stream the OCR text to disk page by page instead of building one big
    # string in memory; keep a short excerpt of the first three pages.
    ocr_path = _get_ocr_text_path(file_path.name)
    ocr_path.parent.mkdir(parents=True, exist_ok=True)
    preview_parts = []
    with open(ocr_path, "w", encoding="utf-8") as ocr_f:
        for i, doc in enumerate(raw_docs):
            page_num = doc.metadata.get("page", i + 1)
            ocr_f.write(f"--- 第 {page_num} 页 ---\n{doc.page_content}\n\n")
            if i < 3:
                preview_parts.append(
                    f"--- 第 {page_num} 页 ---\n{doc.page_content[:200]}..."
                )
    if page_count > 3:
        preview_parts.append(f"\n... (共 {page_count} 页/文档)")
    preview = "\n\n".join(preview_parts)

    # Split the text into chunks.
    splitter = TextProcessingPipeline(
        chunk_size=int(chunk_size),
        chunk_overlap=int(chunk_overlap),
    )
    chunks = splitter.process(raw_docs)

    # Drop the page objects so they can be garbage-collected.
    raw_docs.clear()

    # Embed and index the chunks.
    # NOTE(review): a fresh manager is created and cleared for every file, so
    # presumably only the most recently processed file stays searchable even
    # though _processed_files keeps growing — confirm what clear() wipes.
    _pipeline._vector_store_manager = VectorStoreManager(
        store_type=config.VECTOR_STORE_TYPE,
    )
    _pipeline._vector_store_manager.clear()
    _pipeline._vector_store_manager.add_documents(chunks)

    _pipeline._rag_chain = RAGChain(
        vector_store_manager=_pipeline._vector_store_manager,
        llm=_pipeline.llm,
    )

    # A new document invalidates the previous conversation.
    _chat_history = []

    file_info = {
        "name": file_path.name,
        "format": suffix,
        # Keep the historical fallback for the "OCR returned nothing" case.
        "pages": page_count or _count_ocr_pages(ocr_path),
        "chunks": len(chunks),
        "size_mb": round(file_size_mb, 2),
        "time": time.strftime("%Y-%m-%d %H:%M:%S"),
        "path": str(file_path),
    }
    _processed_files.append(file_info)

    # Force a collection of the temporaries produced during OCR.
    del chunks
    gc.collect()

    logger.info(f"文件处理成功: {file_path.name}, {file_info['pages']} 页, {file_info['chunks']} 块")
    return file_info, preview
221
+
222
+
223
+ def _count_ocr_pages(ocr_path: Path) -> int:
224
+ """从保存的 OCR 文件统计页数"""
225
+ try:
226
+ text = ocr_path.read_text(encoding="utf-8")
227
+ return text.count("--- 第") or 1
228
+ except Exception:
229
+ return 1
230
+
231
+
232
def ask_question_impl(question: str) -> Dict[str, Any]:
    """Answer *question* with the RAG pipeline and record the exchange.

    Returns:
        ``{"answer": str, "sources": list}``; each source carries its rank,
        document, page, content type and the first 200 characters of text.

    Raises:
        RuntimeError: If no pipeline exists yet or it reports not ready.
    """
    global _pipeline, _chat_history

    pipeline_ready = _pipeline is not None and _pipeline.is_ready
    if not pipeline_ready:
        raise RuntimeError("请先上传并处理文件")

    result = _pipeline.ask_with_history(question, _chat_history)

    _chat_history.append({"role": "user", "content": question})
    answer = result["answer"]
    _chat_history.append({"role": "assistant", "content": answer})
    # Cap the history at the 20 most recent exchanges (40 messages) so it
    # cannot grow without bound.
    if len(_chat_history) > 40:
        _chat_history = _chat_history[-40:]

    sources = [
        {
            "rank": src["rank"],
            "document": src["document"],
            "page": src["page"],
            "content_type": src.get("content_type", ""),
            "content": src["content"][:200],
        }
        for src in result.get("sources", [])
    ]

    return {"answer": answer, "sources": sources}
258
+
259
+
260
def clear_chat_impl():
    """Discard the conversation history by rebinding it to a new empty list."""
    global _chat_history
    _chat_history = list()
263
+
264
+
265
def get_system_status_impl() -> Dict[str, Any]:
    """Collect the current configuration and indexing state for the UI.

    API keys are masked before being returned. ``document_count`` stays 0
    when no pipeline exists or its stats cannot be read.
    """
    global _pipeline, _processed_files

    def _mask_key(key: str) -> str:
        # Empty keys and the "not-needed" placeholder are reported as "".
        if not key or key == "not-needed":
            return ""
        length = len(key)
        # Short keys are fully starred; longer ones keep 4 chars at each end.
        return "*" * length if length <= 8 else key[:4] + "****" + key[-4:]

    status: Dict[str, Any] = {}
    status["embedding"] = {
        "model": config.EMBEDDING_MODEL_NAME,
        "api_base": config.EMBEDDING_API_BASE,
        "api_key": _mask_key(config.EMBEDDING_API_KEY),
    }
    status["llm"] = {
        "model": config.LLM_MODEL_NAME,
        "api_base": config.LLM_API_BASE,
        "api_key": _mask_key(config.LLM_API_KEY),
    }
    status["ocr"] = {
        "engine": config.OCR_ENGINE,
        "model": config.OCR_API_MODEL,
        "api_base": config.OCR_API_BASE,
        "api_key": _mask_key(config.OCR_API_KEY),
    }
    status["vector_store"] = config.VECTOR_STORE_TYPE
    status["params"] = {
        "chunk_size": config.CHUNK_SIZE,
        "chunk_overlap": config.CHUNK_OVERLAP,
        "retrieval_top_k": config.RETRIEVAL_TOP_K,
    }
    status["document_count"] = 0
    status["files"] = _processed_files

    if _pipeline is None:
        return status

    try:
        status["document_count"] = _pipeline.stats.get("document_count", 0)
    except Exception:
        # Best effort: keep the default of 0 when stats are unavailable.
        pass

    return status
310
+
311
+
312
def preload_ocr_engine():
    """Warm up the OCR engine at startup so the first upload does not wait.

    Only the ``paddle`` engine is loaded; ``api`` mode just logs the
    endpoint, and any other value is ignored. Warm-up failures are logged
    as warnings.
    """
    if config.OCR_ENGINE == "paddle":
        try:
            logger.info("预热 PaddleOCR-VL 引擎...")
            from ocr_loader import _get_ocr_vl_pipeline as warm_up
            warm_up()
            logger.info("OCR 引擎预热完成 ✓")
        except Exception as e:
            logger.warning(f"OCR 引擎预热跳过: {e}")
        return

    if config.OCR_ENGINE == "api":
        logger.info(f"OCR API 模式, 跳过预热 (endpoint: {config.OCR_API_BASE})")
324
+
325
+
326
+ # ============================================================
327
+ # FastAPI App
328
+ # ============================================================
329
+
330
app = FastAPI(title="PDF OCR 智能问答系统", version="2.0")

# Static assets live in a "static" directory next to this module; it is
# created on import so mounting never fails on a fresh checkout.
STATIC_DIR = Path(__file__).resolve().parent.joinpath("static")
STATIC_DIR.mkdir(exist_ok=True)
app.mount(
    "/static",
    StaticFiles(directory=str(STATIC_DIR)),
    name="static",
)
336
+
337
+
338
class ChatRequest(BaseModel):
    """Request body of ``POST /api/chat``."""

    # The user's question.
    question: str
340
+
341
+
342
class ChatResponse(BaseModel):
    """Response body of ``POST /api/chat``."""

    # Generated answer text (or an error message on failure).
    answer: str
    # Retrieved source snippets backing the answer.
    sources: List[Dict[str, Any]]
345
+
346
+
347
+ # ── Routes ──
348
+
349
+
350
@app.get("/", response_class=HTMLResponse)
async def index():
    """Serve ``static/index.html``, or a 404 page when it is missing."""
    page = STATIC_DIR / "index.html"
    if not page.exists():
        return HTMLResponse("<h1>Frontend not found</h1>", status_code=404)
    return FileResponse(page)
357
+
358
+
359
@app.post("/api/upload")
async def upload_files(
    files: List[UploadFile] = File(...),
    chunk_size: int = Form(800),
    chunk_overlap: int = Form(150),
):
    """Upload and process one or more documents.

    Each file is saved into ``config.UPLOAD_DIR`` and passed to
    ``process_file_impl``. Per-file failures are collected in ``errors``;
    the request fails only when no file is supplied (400) or when every
    file failed (500).

    Returns:
        ``{"success", "results", "errors", "total"}`` where ``results``
        holds one entry per successfully processed file.
    """
    if not files or all(not f.filename for f in files):
        raise HTTPException(400, "No files provided")

    upload_dir = config.UPLOAD_DIR
    upload_dir.mkdir(parents=True, exist_ok=True)

    results = []
    all_errors = []

    for file in files:
        if not file.filename:
            continue

        # The filename comes from the client: keep only its final component
        # so values such as "../../x" or absolute paths cannot escape
        # upload_dir. Backslashes are normalised so Windows-style paths are
        # stripped as well.
        safe_name = Path(file.filename.replace("\\", "/")).name
        if safe_name in ("", ".."):
            all_errors.append(f"{file.filename}: invalid file name")
            continue

        tmp_path = upload_dir / safe_name
        try:
            with open(tmp_path, "wb") as f:
                shutil.copyfileobj(file.file, f)

            file_info, preview = process_file_impl(tmp_path, chunk_size, chunk_overlap)

            results.append({
                "success": True,
                "name": file_info["name"],
                "format": file_info["format"],
                "pages": file_info["pages"],
                "chunks": file_info["chunks"],
                "size_mb": file_info["size_mb"],
                "time": file_info["time"],
                "preview": preview,
                "message": "处理完成",
            })
        except ValueError as e:
            # Validation failures (format / size) are expected; no traceback.
            all_errors.append(f"{file.filename}: {e}")
        except Exception as e:
            # logger.exception records the message together with the traceback.
            logger.exception(f"处理失败 {file.filename}: {e}")
            all_errors.append(f"{file.filename}: {e}")

    if not results and all_errors:
        raise HTTPException(500, "; ".join(all_errors))

    _save_files_to_disk()

    return {
        "success": True,
        "results": results,
        "errors": all_errors,
        "total": len(results),
    }
416
+
417
+
418
@app.delete("/api/files/{index}")
async def delete_file(index: int):
    """Remove the processed file at *index* and its stored OCR text.

    Raises a 404 when *index* is outside the current list.
    """
    global _processed_files
    if index < 0 or index >= len(_processed_files):
        raise HTTPException(404, "File index not found")

    removed = _processed_files.pop(index)
    removed_name = removed["name"]
    _delete_ocr_text(removed_name)
    _save_files_to_disk()
    logger.info(f"已移除文件: {removed['name']}")
    return {"success": True, "removed": removed_name}
429
+
430
+
431
@app.get("/api/preview/{index}")
async def get_preview(index: int):
    """Return the full OCR text stored on disk for the file at *index*.

    Raises a 404 for an out-of-range index; a missing or empty text file
    yields a ``success: False`` payload instead.
    """
    if not (0 <= index < len(_processed_files)):
        raise HTTPException(404, "File index out of range")

    filename = _processed_files[index]["name"]
    text = _load_ocr_text(filename)
    if not text:
        return {"success": False, "text": "", "message": "OCR text file not found on disk"}
    return {"success": True, "text": text, "index": index, "filename": filename}
441
+
442
+
443
@app.get("/api/file/{index}")
async def get_original_file(index: int):
    """Serve the originally uploaded file at *index* for preview.

    Raises a 404 when the index is out of range or the file is gone.
    """
    if not (0 <= index < len(_processed_files)):
        raise HTTPException(404, f"File index {index} out of range (total: {len(_processed_files)})")

    entry = _processed_files[index]
    filename = entry["name"]

    # 1) Prefer the path recorded when the file was processed.
    file_path = entry.get("path", "")
    if file_path and Path(file_path).exists():
        return FileResponse(file_path)

    # 2) Fall back to looking the name up in the upload directory.
    fallback = config.UPLOAD_DIR / filename
    if fallback.exists():
        return FileResponse(str(fallback))

    raise HTTPException(404, f"Original file not found: {filename}")
458
+
459
+
460
@app.post("/api/chat", response_model=ChatResponse)
async def chat(req: ChatRequest):
    """Answer a question about the processed documents.

    Failures are reported inside the answer text instead of as HTTP errors:
    a ``RuntimeError`` message is returned as-is, any other exception is
    logged, its traceback printed, and a prefixed message returned.

    Args:
        req: Request body carrying the ``question``.

    Returns:
        A ``ChatResponse`` on success, otherwise a dict with ``answer`` and an
        empty ``sources`` list.
    """
    try:
        return ChatResponse(**ask_question_impl(req.question))
    except RuntimeError as exc:
        return {"answer": str(exc), "sources": []}
    except Exception as exc:
        logger.error(f"问答失败: {exc}")
        import traceback
        traceback.print_exc()
        failure = f"问答失败: {str(exc)}"
        return {"answer": failure, "sources": []}
473
+
474
+
475
@app.delete("/api/chat")
async def clear_chat():
    """Reset the chat history by delegating to ``clear_chat_impl``.

    Returns:
        ``{"success": True}`` once the history has been cleared.
    """
    clear_chat_impl()
    acknowledgement = {"success": True}
    return acknowledgement
480
+
481
+
482
@app.get("/api/status")
async def get_status():
    """Report the current system status.

    Returns:
        Whatever ``get_system_status_impl`` produces, passed through unchanged.
    """
    status = get_system_status_impl()
    return status
486
+
487
+
488
+ # ── Config API ──
489
+
490
+ CONFIG_KEYS = {
491
+ "EMBEDDING_API_BASE", "EMBEDDING_MODEL_NAME", "EMBEDDING_API_KEY",
492
+ "LLM_API_BASE", "LLM_MODEL_NAME", "LLM_API_KEY",
493
+ "OCR_API_BASE", "OCR_API_MODEL", "OCR_API_KEY", "OCR_ENGINE",
494
+ "CHUNK_SIZE", "CHUNK_OVERLAP", "RETRIEVAL_TOP_K",
495
+ }
496
+
497
+
498
def _update_env_file(updates: Dict[str, str]):
    """Persist configuration changes to the project's ``.env`` file.

    Existing ``KEY=value`` lines whose key appears in ``updates`` are rewritten
    in place; comments, blank lines and unrelated keys are kept unchanged.
    Keys that are not in the file yet are appended at the end. The file is
    created when it does not exist.

    Carriage returns and line feeds are removed from every value before it is
    written. Each entry must occupy exactly one line: a value containing a
    line break would otherwise appear as additional ``KEY=value`` entries the
    next time the file is read.

    Args:
        updates: Mapping of environment variable name to its new value.
    """
    env_path = config.BASE_DIR / ".env"
    if env_path.exists():
        lines = env_path.read_text(encoding="utf-8").splitlines()
    else:
        lines = []

    # One physical line per entry: strip CR/LF so a value cannot add lines.
    clean = {
        key: str(value).replace("\r", "").replace("\n", "")
        for key, value in updates.items()
    }

    updated_keys = set()
    new_lines = []
    for line in lines:
        stripped = line.strip()
        if stripped and not stripped.startswith("#") and "=" in stripped:
            key = stripped.split("=", 1)[0].strip()
            if key in clean:
                new_lines.append(f"{key}={clean[key]}")
                updated_keys.add(key)
                continue
        new_lines.append(line)

    # Keys that were not already present are appended in insertion order.
    new_lines.extend(
        f"{key}={value}" for key, value in clean.items() if key not in updated_keys
    )

    env_path.write_text("\n".join(new_lines) + "\n", encoding="utf-8")
523
+
524
+
525
@app.get("/api/config")
async def get_config():
    """Return the current API configuration grouped by subsystem.

    Returns:
        A dict with the sections ``embedding``, ``llm``, ``ocr`` and
        ``retrieval``, each filled from the ``config`` module.
    """
    # NOTE(review): the api_key values are returned exactly as configured.
    # Confirm that this endpoint is only reachable by trusted clients.
    embedding = {
        "api_base": config.EMBEDDING_API_BASE,
        "model_name": config.EMBEDDING_MODEL_NAME,
        "api_key": config.EMBEDDING_API_KEY,
    }
    llm = {
        "api_base": config.LLM_API_BASE,
        "model_name": config.LLM_MODEL_NAME,
        "api_key": config.LLM_API_KEY,
    }
    ocr = {
        "engine": config.OCR_ENGINE,
        "api_base": config.OCR_API_BASE,
        "model_name": config.OCR_API_MODEL,
        "api_key": config.OCR_API_KEY,
    }
    retrieval = {
        "chunk_size": config.CHUNK_SIZE,
        "chunk_overlap": config.CHUNK_OVERLAP,
        "top_k": config.RETRIEVAL_TOP_K,
    }
    return {
        "embedding": embedding,
        "llm": llm,
        "ocr": ocr,
        "retrieval": retrieval,
    }
551
+
552
+
553
@app.post("/api/config")
async def update_config(updates: Dict[str, str]):
    """Update the API configuration, persist it to ``.env`` and apply it at once.

    Only keys listed in ``CONFIG_KEYS`` are accepted; anything else in the
    request is ignored. All accepted values are validated *before* the process
    environment or the ``.env`` file is touched, so a rejected request leaves
    the running and the persisted configuration unchanged.

    Args:
        updates: Mapping of configuration key to its new value.

    Returns:
        A dict with ``success`` and the list of ``updated`` keys.

    Raises:
        HTTPException: 400 when an integer setting is not a valid integer or a
            value contains a line break.
    """
    import importlib
    import os as _os
    import sys

    applied = {key: str(value) for key, value in updates.items() if key in CONFIG_KEYS}

    # config.py parses these with int() while it is being (re)loaded. A bad
    # value would make the reload below fail and, once written to .env, would
    # also break the next start-up.
    for key in ("CHUNK_SIZE", "CHUNK_OVERLAP", "RETRIEVAL_TOP_K"):
        if key in applied:
            try:
                int(applied[key])
            except ValueError:
                raise HTTPException(
                    400, f"{key} must be an integer, got {applied[key]!r}"
                ) from None

    # A .env entry is a single line; refuse values that would span several.
    for key, value in applied.items():
        if "\n" in value or "\r" in value:
            raise HTTPException(400, f"{key} must not contain line breaks")

    if applied:
        _os.environ.update(applied)
        _update_env_file(applied)

        # Re-execute the config module so its constants pick up the new values.
        importlib.reload(config)

        # Drop cached clients that were built from the previous settings.
        from embeddings import reset_embedding_model
        reset_embedding_model()

        # ocr_loader caches its API client in a module global; clear it so new
        # OCR_API_BASE / OCR_API_KEY values are used. Only touched when the
        # module has already been imported.
        ocr_module = sys.modules.get("ocr_loader")
        if ocr_module is not None and hasattr(ocr_module, "_ocr_api_client"):
            ocr_module._ocr_api_client = None

        logger.info(f"配置已更新: {list(applied.keys())}")

    return {"success": True, "updated": list(applied.keys())}
578
+
579
+
580
+ # ============================================================
581
+ # Main
582
+ # ============================================================
583
+
584
def main():
    """Configure logging, restore state, warm up OCR and start the web server.

    Steps, in order: replace the default log sink with a rotating file sink
    and a coloured console sink, log the active configuration, reload the
    processed-file list from disk, preload the OCR engine, then run uvicorn.
    """
    import uvicorn

    def _echo(msg):
        # Console sink: messages already end with a newline.
        print(msg, end="")

    logger.remove()

    file_sink = config.LOG_DIR / "app_{time:YYYY-MM-DD}.log"
    logger.add(
        file_sink,
        level=config.LOG_LEVEL,
        format=config.LOG_FORMAT,
        rotation="100 MB",
        retention="30 days",
        encoding="utf-8",
    )

    console_format = "<green>{time:HH:mm:ss}</green> | <level>{level: <8}</level> | <level>{message}</level>"
    logger.add(_echo, level="INFO", format=console_format, colorize=True)

    banner = "=" * 50
    logger.info(banner)
    logger.info(" PDF OCR 智能问答系统 启动中...")
    logger.info(banner)
    logger.info(f" OCR: PaddleOCR-VL-1.5 ({config.OCR_VL_BACKEND})")
    logger.info(f" 嵌入: {config.EMBEDDING_MODEL_NAME} (API: {config.EMBEDDING_API_BASE})")
    logger.info(f" LLM: {config.LLM_MODEL_NAME} (API: {config.LLM_API_BASE})")
    logger.info(f" OCR: {config.OCR_ENGINE} ({config.OCR_API_BASE if config.OCR_ENGINE == 'api' else 'local'})")
    logger.info(f" 向量数据库: {config.VECTOR_STORE_TYPE}")
    logger.info(f" 支持格式: {sorted(config.SUPPORTED_FORMATS)}")

    # Restore the processed-file list from disk.
    _load_files_from_disk()

    # Warm up the OCR engine before serving requests.
    preload_ocr_engine()

    server_options = {
        "host": "0.0.0.0",
        "port": 7860,
        "reload": False,
        "log_level": "info",
    }
    uvicorn.run(app, **server_options)
626
+
627
+
628
+ if __name__ == "__main__":
629
+ main()
assets/OCR_RAG.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c39f595d1822c620fde84ef317c2ea95b24edb8568214e06aec97e3f5251bed
3
+ size 2590191
assets/image-1.png ADDED

Git LFS Details

  • SHA256: beb6fca3b3b33715d1001ad58b1eccdfed1cff7b8b2d2775165f4b46d6345f96
  • Pointer size: 131 Bytes
  • Size of remote file: 105 kB
assets/image-12.png ADDED
assets/image-13.png ADDED

Git LFS Details

  • SHA256: 29bad2dd98b3ab0986b815943b5b90680c039d79941ad116dde3abe09426dd92
  • Pointer size: 131 Bytes
  • Size of remote file: 143 kB
assets/image-14.png ADDED

Git LFS Details

  • SHA256: 30dc40d34bd91a4a3a2d8770f4394afaab32fea5c9e0f7c65e1858d8d7542131
  • Pointer size: 131 Bytes
  • Size of remote file: 285 kB
assets/image-15.png ADDED
assets/image-16.png ADDED

Git LFS Details

  • SHA256: 92c5e29dfd27636add95436fc2c3384fd11210072cae54486148a2255eb55991
  • Pointer size: 131 Bytes
  • Size of remote file: 112 kB
assets/image-2.png ADDED
assets/image-3.png ADDED
assets/image-4.png ADDED

Git LFS Details

  • SHA256: 009ce7407dcc238ea3be3ea95fa51b0aee6c101350a1f8e78fe11a6021baf3ee
  • Pointer size: 130 Bytes
  • Size of remote file: 88.2 kB
assets/image-5.png ADDED

Git LFS Details

  • SHA256: e46b3987d752e684e2bbb36be6f1954cea53bd2972881730dc0b6cf86456855a
  • Pointer size: 130 Bytes
  • Size of remote file: 95.7 kB
assets/image-7.png ADDED

Git LFS Details

  • SHA256: f5f84175ec7c81bbf4cb5104e7099a570e09bedf625f56a3c24d71ff20d754c6
  • Pointer size: 130 Bytes
  • Size of remote file: 74.7 kB
assets/image.png ADDED

Git LFS Details

  • SHA256: d1db47b27a5c28c2c76448219f132ba1d392560902520cdbf5fb1676b67426e8
  • Pointer size: 131 Bytes
  • Size of remote file: 136 kB
config.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ============================================================
3
+ OCR RAG 智能问答系统 - 全局配置
4
+ ============================================================
5
+
6
+ """
7
+
8
+ import os
9
+ from pathlib import Path
10
+ from dotenv import load_dotenv
11
+
12
+ load_dotenv()
13
+
14
+ # ---- 项目路径 ----
15
+ BASE_DIR = Path(__file__).resolve().parent
16
+ DATA_DIR = BASE_DIR / "data"
17
+ UPLOAD_DIR = DATA_DIR / "uploads"
18
+ OCR_OUTPUT_DIR = DATA_DIR / "ocr_output"
19
+ VECTOR_DB_DIR = DATA_DIR / "vector_db"
20
+ LOG_DIR = DATA_DIR / "logs"
21
+
22
+ for d in [DATA_DIR, UPLOAD_DIR, OCR_OUTPUT_DIR, VECTOR_DB_DIR, LOG_DIR]:
23
+ d.mkdir(parents=True, exist_ok=True)
24
+
25
+ # ============================================================
26
+ # PaddleOCR-VL-1.5 配置
27
+ # ============================================================
28
+ # PaddleOCR-VL-1.5: 0.9B 视觉语言模型, OmniDocBench v1.5 94.5% 精度
29
+ # 支持: PDF / PNG / JPG / BMP / TIF
30
+ # OCR 引擎:
31
+ # paddle - PaddleOCR pipeline (默认, 版面分析 + 页面级解析, 推荐)
32
+ # api - 通过 OpenAI 兼容 API 调用 (如 vLLM 部署的服务端, 无需本地推理)
33
+ # PaddleOCR 后端 (仅 engine=paddle 时生效):
34
+ # native - 本地 PaddlePaddle 推理
35
+ # vllm-server - vLLM 服务端 (高吞吐)
36
+ # llama-cpp-server - llama.cpp GGUF (边缘设备)
37
+
38
+ OCR_ENGINE = os.getenv("OCR_ENGINE", "paddle") # paddle / api
39
+
40
+ # OCR API 配置 (OCR_ENGINE=api, 通过 OpenAI 兼容 API 调用)
41
+ # vLLM 部署:
42
+ # python -m vllm.entrypoints.openai.api_server \
43
+ # --model PaddlePaddle/PaddleOCR-VL-1.5 --trust-remote-code --port 8002
44
+ OCR_API_BASE = os.getenv("OCR_API_BASE", "http://127.0.0.1:8002/v1")
45
+ OCR_API_KEY = os.getenv("OCR_API_KEY", "not-needed")
46
+ OCR_API_MODEL = os.getenv("OCR_API_MODEL", "PaddleOCR-VL-1.5")
47
+ OCR_TASK = os.getenv("OCR_TASK", "ocr") # ocr / table / chart / formula / spotting / seal
48
+
49
+ OCR_VL_BACKEND = os.getenv("OCR_VL_BACKEND", "native")
50
+ OCR_VL_SERVER_URL = os.getenv("OCR_VL_SERVER_URL", "http://127.0.0.1:8080/v1")
51
+
52
+ OCR_USE_LAYOUT = os.getenv("OCR_USE_LAYOUT", "true").lower() == "true"
53
+ OCR_LAYOUT_THRESHOLD = float(os.getenv("OCR_LAYOUT_THRESHOLD", "0.5"))
54
+ OCR_USE_CHART = os.getenv("OCR_USE_CHART", "false").lower() == "true"
55
+
56
+ OCR_MAX_NEW_TOKENS = int(os.getenv("OCR_MAX_NEW_TOKENS", "4096"))
57
+ OCR_TEMPERATURE = float(os.getenv("OCR_TEMPERATURE", "0.0"))
58
+
59
+ PDF_RENDER_DPI = int(os.getenv("PDF_RENDER_DPI", "300"))
60
+
61
+ SUPPORTED_IMAGE_FORMATS = {".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"}
62
+ SUPPORTED_FORMATS = {".pdf"} | SUPPORTED_IMAGE_FORMATS
63
+
64
+ MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50"))
65
+
66
+ # ============================================================
67
+ # 文本分割
68
+ # ============================================================
69
+ CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "800"))
70
+ CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "150"))
71
+ SEPARATORS = ["\n\n", "\n", "。", "!", "?", ";", ".", "!", "?", ";", " ", ""]
72
+
73
+ # ============================================================
74
+ # Embedding API 配置 (OpenAI 兼容格式)
75
+ # ============================================================
76
+
77
+
78
+ EMBEDDING_MODEL_NAME = os.getenv(
79
+ "EMBEDDING_MODEL_NAME", "Qwen/Qwen3-Embedding-0.6B"
80
+ )
81
+ EMBEDDING_API_BASE = os.getenv(
82
+ "EMBEDDING_API_BASE", "http://127.0.0.1:8001/v1"
83
+ )
84
+ EMBEDDING_API_KEY = os.getenv("EMBEDDING_API_KEY", "not-needed")
85
+ EMBEDDING_BATCH_SIZE = int(os.getenv("EMBEDDING_BATCH_SIZE", "10"))
86
+
87
+ # ============================================================
88
+ # 向量数据库
89
+ # ============================================================
90
+ VECTOR_STORE_TYPE = os.getenv("VECTOR_STORE_TYPE", "chroma")
91
+ CHROMA_COLLECTION_NAME = os.getenv("CHROMA_COLLECTION_NAME", "pdf_ocr_knowledge")
92
+ RETRIEVAL_TOP_K = int(os.getenv("RETRIEVAL_TOP_K", "3"))
93
+
94
+ # ============================================================
95
+ # LLM API 配置 (OpenAI 兼容格式)
96
+ # ============================================================
97
+
98
+
99
+ LLM_API_KEY = os.getenv("LLM_API_KEY", "not-needed")
100
+ LLM_API_BASE = os.getenv("LLM_API_BASE", "http://127.0.0.1:8000/v1")
101
+ LLM_MODEL_NAME = os.getenv("LLM_MODEL_NAME", "Qwen/Qwen3-8B")
102
+ LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.1"))
103
+ LLM_MAX_TOKENS = int(os.getenv("LLM_MAX_TOKENS", "512"))
104
+
105
+ # ============================================================
106
+ # 系统 Prompt
107
+ # ============================================================
108
+ SYSTEM_PROMPT = """根据以下文档内容,简洁回答用户问题。只依据文档内容回答,不要编造。使用中文。"""
109
+
110
+ RAG_PROMPT_TEMPLATE = """{system_prompt}
111
+
112
+ ## 参考文档内容:
113
+ {context}
114
+
115
+ ## 用户问题:
116
+ {question}
117
+
118
+ ## 回答:"""
119
+
120
+ # ============================================================
121
+ # 日志
122
+ # ============================================================
123
+ LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")
124
+ LOG_FORMAT = "{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} - {message}"
embeddings.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ============================================================
3
+ 向量嵌入模块 (OpenAI 兼容 API)
4
+ ============================================================
5
+ 直接使用 openai 客户端, 兼容:
6
+ - 阿里云 DashScope (text-embedding-v4 等)
7
+ - vLLM 部署的 Qwen3-Embedding
8
+ - 任意 OpenAI 兼容嵌入服务
9
+
10
+ 用法:
11
+ model = get_embedding_model()
12
+ vec = model.embed_query("查询文本")
13
+ vecs = model.embed_documents(["文本1", "文本2"])
14
+ """
15
+
16
+ from typing import List, Optional
17
+ import numpy as np
18
+
19
+ from langchain_core.embeddings import Embeddings
20
+ from openai import OpenAI
21
+
22
+ from loguru import logger
23
+
24
+ import config
25
+
26
+
27
+ # ============================================================
28
+ # 通用 OpenAI 兼容嵌入类
29
+ # ============================================================
30
+
31
class OpenAICompatEmbeddings(Embeddings):
    """Thin embedding client for OpenAI-compatible endpoints.

    Requests are sent with the plain ``openai`` client rather than through the
    ``langchain_openai`` wrapper.
    """

    def __init__(
        self,
        model: Optional[str] = None,
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        batch_size: Optional[int] = None,
        dimensions: Optional[int] = None,
    ):
        """Create the client; unset arguments fall back to ``config``.

        Args:
            model: Embedding model name.
            api_key: API key for the endpoint.
            base_url: Base URL of the endpoint.
            batch_size: Number of texts sent per request.
            dimensions: Optional output dimension; only sent when truthy.
        """
        self.model = model or config.EMBEDDING_MODEL_NAME
        if batch_size is None:
            batch_size = config.EMBEDDING_BATCH_SIZE
        self.batch_size = batch_size
        self.dimensions = dimensions

        endpoint = base_url or config.EMBEDDING_API_BASE
        self._client = OpenAI(
            api_key=api_key or config.EMBEDDING_API_KEY,
            base_url=endpoint,
        )

        logger.info(
            f"Embedding API 连接: model={self.model}, "
            f"base_url={endpoint}"
        )

    def _create_embeddings(self, payload):
        """Send one embeddings request for ``payload`` (a string or a list)."""
        params = {"model": self.model, "input": payload}
        if self.dimensions:
            params["dimensions"] = self.dimensions
        return self._client.embeddings.create(**params)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of texts, ``batch_size`` texts per request.

        Args:
            texts: Texts to embed.

        Returns:
            One embedding per text, collected batch by batch; ``[]`` for an
            empty input.
        """
        if not texts:
            return []

        total = len(texts)
        report_progress = total > self.batch_size
        vectors = []

        for start in range(0, total, self.batch_size):
            response = self._create_embeddings(texts[start : start + self.batch_size])
            # Embeddings are taken in the order the response lists them.
            vectors.extend(item.embedding for item in response.data)

            if report_progress:
                logger.debug(
                    f"嵌入进度: {min(start + self.batch_size, total)}/{total}"
                )

        return vectors

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string and return its vector."""
        response = self._create_embeddings(text)
        return response.data[0].embedding
94
+
95
+
96
+ # ============================================================
97
+ # 全局单例
98
+ # ============================================================
99
+
100
+ _embedding_model: Optional[Embeddings] = None
101
+
102
+
103
def get_embedding_model(
    model_name: Optional[str] = None,
    api_base: Optional[str] = None,
) -> Embeddings:
    """Return the process-wide embedding model, creating it on first use.

    Args:
        model_name: Model name used when the instance is created.
        api_base: Endpoint base URL used when the instance is created.

    Returns:
        The shared embedding model. Once an instance exists it is returned
        as-is and the arguments are not consulted.
    """
    global _embedding_model
    if _embedding_model is not None:
        return _embedding_model

    _embedding_model = OpenAICompatEmbeddings(model=model_name, base_url=api_base)
    return _embedding_model
115
+
116
+
117
def reset_embedding_model():
    """Forget the shared embedding model so the next request rebuilds it."""
    global _embedding_model
    _embedding_model = None
    message = "嵌入模型已重置"
    logger.info(message)
122
+
123
+
124
+ # ============================================================
125
+ # 工具函数
126
+ # ============================================================
127
+
128
def compute_similarity(vec1: List[float], vec2: List[float]) -> float:
    """Return the cosine similarity of two vectors.

    Args:
        vec1: First vector.
        vec2: Second vector.

    Returns:
        The dot product divided by the product of the norms, or ``0.0`` when
        that product is zero (at least one vector has zero length).
    """
    a = np.array(vec1)
    b = np.array(vec2)
    scale = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / scale) if scale != 0 else 0.0
135
+
136
+
137
def batch_embed(
    texts: List[str],
    model: Optional[Embeddings] = None,
    batch_size: Optional[int] = None,
    show_progress: bool = False,
) -> List[List[float]]:
    """Embed texts in slices of a caller-chosen size.

    Args:
        texts: Texts to embed.
        model: Embedding model; the shared instance is used when omitted.
        batch_size: Slice size; a falsy value selects
            ``config.EMBEDDING_BATCH_SIZE``.
        show_progress: Log progress after every slice except the last.

    Returns:
        The embeddings of all slices, concatenated in order.
    """
    embedder = get_embedding_model() if model is None else model

    vectors = []
    total = len(texts)
    step = batch_size or config.EMBEDDING_BATCH_SIZE

    for start in range(0, total, step):
        stop = start + step
        vectors.extend(embedder.embed_documents(texts[start:stop]))

        if show_progress and stop < total:
            logger.debug(f"嵌入进度: {min(stop, total)}/{total}")

    return vectors
160
+
161
+
162
+ # ============================================================
163
+ # 测试入口
164
+ # ============================================================
165
+
166
+ if __name__ == "__main__":
167
+ print("测试 Embedding API 连接...\n")
168
+ print(f"API: {config.EMBEDDING_API_BASE}")
169
+ print(f"模型: {config.EMBEDDING_MODEL_NAME}")
170
+
171
+ try:
172
+ model = get_embedding_model()
173
+
174
+ test_texts = [
175
+ "这是第一段测试文本,用于验证嵌入API是否正常工作。",
176
+ "这是第二段完全不同的文本内容,涉及人工智能话题。",
177
+ "向量嵌入是自然语言处理中的基础技术。",
178
+ ]
179
+
180
+ print("\n测试单文本嵌入 (embed_query)...")
181
+ query_vec = model.embed_query("嵌入模型测试")
182
+ print(f" 维度: {len(query_vec)}")
183
+
184
+ print("\n测试批量嵌入 (embed_documents)...")
185
+ doc_vecs = model.embed_documents(test_texts)
186
+ print(f" 数量: {len(doc_vecs)}, 维度: {len(doc_vecs[0])}")
187
+
188
+ print("\n测试相似度计算...")
189
+ sim1 = compute_similarity(doc_vecs[2], query_vec)
190
+ sim2 = compute_similarity(doc_vecs[0], query_vec)
191
+ print(f" 查询 vs 向量嵌入文本: {sim1:.4f}")
192
+ print(f" 查询 vs 无关文本: {sim2:.4f}")
193
+
194
+ print(f"\n✓ Embedding API 测试通过")
195
+
196
+ except Exception as e:
197
+ print(f"\n✗ API 连接失败: {e}")
198
+ print(f" 请确保 Embedding API 服务已启动: {config.EMBEDDING_API_BASE}")
ocr_loader.py ADDED
@@ -0,0 +1,829 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ============================================================
3
+ PaddleOCR-VL-1.5 文档加载器
4
+ ============================================================
5
+ 模型: PaddleOCR-VL-1.5 (0.9B 视觉语言模型, OmniDocBench v1.5 94.5% 精度)
6
+ 支持格式: PDF / PNG / JPG / JPEG / BMP / TIF / TIFF
7
+
8
+ 功能:
9
+ 1. 文档 (PDF/图片) → PaddleOCR-VL-1.5 端到端识别
10
+ 2. 输出 Markdown/JSON 结构化结果 (含版面/表格/公式/印章)
11
+ 3. 转换为 LangChain Document 对象
12
+ """
13
+
14
+ import gc
15
+ import time
16
+ import warnings
17
+ from pathlib import Path
18
+ from typing import List, Optional, Iterator, Dict, Any, Union
19
+ from dataclasses import dataclass, field
20
+
21
+ import fitz # PyMuPDF: PDF 页面渲染和元数据提取
22
+ import numpy as np
23
+ from PIL import Image
24
+
25
+ from langchain_core.documents import Document
26
+
27
+ from loguru import logger
28
+
29
+ import config
30
+
31
+ warnings.filterwarnings("ignore")
32
+
33
+
34
+ # ============================================================
35
+ # PaddleOCR-VL-1.5 全局单例
36
+ # ============================================================
37
+
38
+ _ocr_vl_pipeline = None
39
+
40
+
41
def _get_ocr_vl_pipeline():
    """Return the shared PaddleOCR-VL pipeline, building it on first use.

    The ``paddleocr`` import is deferred until the pipeline is needed. For the
    ``vllm-server`` and ``llama-cpp-server`` backends the recognition backend
    and the server URL from ``config`` are passed along; any other backend
    value adds no extra options.
    """
    global _ocr_vl_pipeline
    if _ocr_vl_pipeline is not None:
        return _ocr_vl_pipeline

    from paddleocr import PaddleOCRVL
    logger.info(
        f"正在初始化 PaddleOCR-VL-1.5 模型 "
        f"(backend={config.OCR_VL_BACKEND})..."
    )

    options = {
        "use_layout_detection": config.OCR_USE_LAYOUT,
        "use_chart_recognition": config.OCR_USE_CHART,
        "merge_layout_blocks": True,
        "layout_threshold": config.OCR_LAYOUT_THRESHOLD,
    }

    backend = config.OCR_VL_BACKEND
    if backend in ("vllm-server", "llama-cpp-server"):
        options["vl_rec_backend"] = backend
        options["vl_rec_server_url"] = config.OCR_VL_SERVER_URL

    _ocr_vl_pipeline = PaddleOCRVL(**options)
    logger.info("PaddleOCR-VL-1.5 模型初始化完成 ✓")
    return _ocr_vl_pipeline
68
+
69
+
70
+ # ============================================================
71
+ # 数据结构
72
+ # ============================================================
73
+
74
@dataclass
class OCRResult:
    """OCR output for a single page or a single image."""

    # 1-based page number; 0 until assigned.
    page_num: int = 0
    # Recognised text in Markdown form.
    markdown_text: str = ""
    # Structured result as delivered by the OCR engine, if any.
    json_data: Optional[Dict[str, Any]] = None
    # Parsed content blocks; every list below starts empty per instance.
    text_blocks: List[Dict[str, Any]] = field(default_factory=list)
    tables: List[Dict[str, Any]] = field(default_factory=list)
    formulas: List[Dict[str, Any]] = field(default_factory=list)
    images_in_page: List[Dict[str, Any]] = field(default_factory=list)
    layout_regions: List[Dict[str, Any]] = field(default_factory=list)
    # Recognition time attributed to this page, in milliseconds.
    ocr_time_ms: float = 0.0
    # Input format, e.g. pdf / png / jpg.
    source_format: str = ""
87
+
88
+
89
+ # ============================================================
90
+ # PaddleOCR-VL-1.5 文本提取器
91
+ # ============================================================
92
+
93
class VLOCRExtractor:
    """Extract structured content from documents with PaddleOCR-VL-1.5."""

    @staticmethod
    def extract(image_or_path: Union[str, Path, np.ndarray]) -> List["OCRResult"]:
        """Run OCR on one image or PDF.

        Args:
            image_or_path: Image path, PDF path or numpy image array.

        Returns:
            One ``OCRResult`` per item produced by the pipeline (several for a
            multi-page PDF, one for an image). ``ocr_time_ms`` is the inference
            time divided evenly across the pages.
        """
        pipeline = _get_ocr_vl_pipeline()
        start_time = time.time()

        logger.info("PaddleOCR-VL 正在推理中 (首次调用较慢, CPU 约 30-60s/页) ...")
        # Materialise the output so it can be counted, in case predict()
        # hands back a lazy iterable rather than a list.
        raw_output = list(pipeline.predict(image_or_path))
        elapsed_s = time.time() - start_time
        logger.info(f"推理完成, 耗时 {elapsed_s:.1f}s")

        # Computed once from the inference time, so the time spent parsing
        # earlier pages is not added to the figure of later pages.
        per_page_ms = elapsed_s * 1000 / len(raw_output) if raw_output else 0.0

        results = []
        for page_num, res in enumerate(raw_output, start=1):
            page_result = OCRResult(page_num=page_num, ocr_time_ms=per_page_ms)

            # Structured JSON is optional: a failure here only skips the
            # structured fields of this page.
            try:
                json_data = res.json
                if json_data:
                    page_result.json_data = json_data
                    page_result.text_blocks = VLOCRExtractor._parse_text_blocks(json_data)
                    page_result.tables = VLOCRExtractor._parse_tables(json_data)
                    page_result.formulas = VLOCRExtractor._parse_formulas(json_data)
            except Exception as e:
                logger.debug(f"JSON 解析跳过: {e}")

            # Markdown text may arrive as a dict, a string or another object.
            try:
                md = res.markdown
                if isinstance(md, dict):
                    page_result.markdown_text = md.get("text", "") or ""
                elif isinstance(md, str):
                    page_result.markdown_text = md
                else:
                    page_result.markdown_text = str(md) if md else ""
            except Exception:
                page_result.markdown_text = ""

            # Fallback: rebuild text from the JSON blocks when Markdown is empty.
            if not page_result.markdown_text and page_result.json_data:
                page_result.markdown_text = VLOCRExtractor._build_text_from_blocks(
                    page_result.json_data
                )

            results.append(page_result)

        return results

    @staticmethod
    def extract_text(image_or_path: Union[str, Path, np.ndarray]) -> str:
        """Return the text of all pages joined by blank lines; empty pages are skipped."""
        results = VLOCRExtractor.extract(image_or_path)
        return "\n\n".join(r.markdown_text for r in results if r.markdown_text)

    @staticmethod
    def extract_to_markdown(image_or_path: Union[str, Path, np.ndarray]) -> str:
        """Return the document as Markdown text (same output as ``extract_text``)."""
        return VLOCRExtractor.extract_text(image_or_path)

    @staticmethod
    def extract_to_json(
        image_or_path: Union[str, Path, np.ndarray],
        save_path: Optional[str] = None,
    ) -> Dict[str, Any]:
        """Return the OCR result as a dict and optionally write it to a file.

        Args:
            image_or_path: Image path, PDF path or numpy image array.
            save_path: When given, the result is also written there as UTF-8
                JSON; missing parent directories are created.

        Returns:
            A dict with ``pages`` (one entry per page) and ``total_pages``.
        """
        results = VLOCRExtractor.extract(image_or_path)
        output = {
            "pages": [
                {
                    "page_num": r.page_num,
                    "markdown": r.markdown_text,
                    "json": r.json_data,
                    "tables": r.tables,
                    "formulas": r.formulas,
                }
                for r in results
            ],
            "total_pages": len(results),
        }

        if save_path:
            import json
            save_path = Path(save_path)
            save_path.parent.mkdir(parents=True, exist_ok=True)
            with open(save_path, "w", encoding="utf-8") as f:
                json.dump(output, f, ensure_ascii=False, indent=2)
            logger.info(f"OCR 结果已保存: {save_path}")

        return output

    # ---- structured-parsing helpers ----

    @staticmethod
    def _get_parsing_list(json_data: Dict) -> List[Dict]:
        """Return the ``parsing_res_list`` entries of a result payload.

        The list is read from ``json_data["res"]`` when that key exists,
        otherwise from ``json_data`` itself. Anything that does not have the
        expected shape yields an empty list, and entries that are not dicts
        are dropped, so callers can use ``.get`` on every item.
        """
        if not isinstance(json_data, dict):
            return []
        res = json_data.get("res", json_data)
        if not isinstance(res, dict):
            return []
        items = res.get("parsing_res_list") or []
        if not isinstance(items, (list, tuple)):
            return []
        return [item for item in items if isinstance(item, dict)]

    @staticmethod
    def _parse_text_blocks(json_data: Dict) -> List[Dict[str, Any]]:
        """Collect every block with content, except ``image`` blocks."""
        blocks = []
        for item in VLOCRExtractor._get_parsing_list(json_data):
            label = item.get("block_label", "")
            content = item.get("block_content", "")
            bbox = item.get("block_bbox", [])
            if content and label not in ("image",):
                blocks.append({
                    "type": label,
                    "text": content,
                    "bbox": bbox,
                })
        return blocks

    @staticmethod
    def _parse_tables(json_data: Dict) -> List[Dict[str, Any]]:
        """Collect the blocks labelled ``table``."""
        return [
            {
                "text": item.get("block_content", ""),
                "html": item.get("block_html", ""),
                "markdown": item.get("block_markdown", ""),
                "bbox": item.get("block_bbox", []),
            }
            for item in VLOCRExtractor._get_parsing_list(json_data)
            if item.get("block_label") == "table"
        ]

    @staticmethod
    def _parse_formulas(json_data: Dict) -> List[Dict[str, Any]]:
        """Collect the blocks labelled ``formula``."""
        return [
            {
                "latex": item.get("block_latex", ""),
                "text": item.get("block_content", ""),
                "bbox": item.get("block_bbox", []),
            }
            for item in VLOCRExtractor._get_parsing_list(json_data)
            if item.get("block_label") == "formula"
        ]

    @staticmethod
    def _build_text_from_blocks(json_data: Dict) -> str:
        """Build plain text from the parsed blocks.

        Tables and formulas get a bracketed prefix, titles and headers become
        ``##`` headings, image blocks and empty blocks are skipped. Blocks are
        joined by blank lines.
        """
        lines = []
        for item in VLOCRExtractor._get_parsing_list(json_data):
            label = item.get("block_label", "")
            content = item.get("block_content", "")
            if not content:
                continue
            if label == "table":
                lines.append(f"[表格] {content}")
            elif label == "formula":
                lines.append(f"[公式] {content}")
            elif label in ("paragraph_title", "header"):
                lines.append(f"## {content}")
            elif label == "image":
                continue  # image-only block, nothing to add
            else:
                lines.append(content)
        return "\n\n".join(lines)
267
+
268
+
269
+ # ============================================================
270
+ # OCR API 提取器 (OpenAI 兼容格式, 无需本地推理)
271
+ # ============================================================
272
+
273
+ _ocr_api_client = None
274
+
275
+
276
def _get_ocr_api_client():
    """Return the shared OCR API client, creating it on first use.

    The client is built from ``config.OCR_API_KEY`` and
    ``config.OCR_API_BASE``; the ``openai`` import is deferred until then.
    """
    global _ocr_api_client
    if _ocr_api_client is not None:
        return _ocr_api_client

    from openai import OpenAI
    _ocr_api_client = OpenAI(
        api_key=config.OCR_API_KEY,
        base_url=config.OCR_API_BASE,
    )
    logger.info(
        f"OCR API 连接: model={config.OCR_API_MODEL}, "
        f"base_url={config.OCR_API_BASE}"
    )
    return _ocr_api_client
290
+
291
+
292
+ class OCRApiExtractor:
293
+ """
294
+ 基于 OpenAI 兼容 API 的 PaddleOCR-VL-1.5 提取器
295
+
296
+ 通过 vLLM 或其他 OpenAI 兼容服务调用, 无需本地 GPU 推理。
297
+
298
+ 支持任务: ocr / table / formula / chart / spotting / seal
299
+ """
300
+
301
+ PROMPTS = {
302
+ "ocr": "OCR:",
303
+ "table": "Table Recognition:",
304
+ "formula": "Formula Recognition:",
305
+ "chart": "Chart Recognition:",
306
+ "spotting": "Spotting:",
307
+ "seal": "Seal Recognition:",
308
+ }
309
+
310
@staticmethod
def extract(
    image_or_path: Union[str, Path, np.ndarray],
    task: Optional[str] = None,
    max_new_tokens: int = 2048,
) -> List[OCRResult]:
    """
    Run OCR on one image through the OpenAI-compatible OCR API.

    Args:
        image_or_path: Path to an image file, an image as a numpy array,
            or already-encoded image bytes.
        task: Key of ``OCRApiExtractor.PROMPTS``; ``config.OCR_TASK`` is
            used when this is falsy.
        max_new_tokens: Generation limit, sent to the API as ``max_tokens``.

    Returns:
        A one-element list holding the ``OCRResult`` for the image.

    Raises:
        KeyError: If the task is not a key of ``OCRApiExtractor.PROMPTS``.
    """
    import base64
    import io
    import mimetypes

    task = task or config.OCR_TASK
    # Resolve the prompt before any file or network I/O so that an unknown
    # task fails immediately and with a message that lists the valid names.
    try:
        prompt = OCRApiExtractor.PROMPTS[task]
    except KeyError:
        raise KeyError(
            f"unknown OCR task {task!r}; "
            f"expected one of {sorted(OCRApiExtractor.PROMPTS)}"
        ) from None

    client = _get_ocr_api_client()

    start_time = time.time()
    logger.info(f"OCR API 推理中 (task={task}) ...")

    # Image -> base64 data URL
    mime_type = "image/png"
    if isinstance(image_or_path, (str, Path)):
        with open(image_or_path, "rb") as f:
            img_bytes = f.read()
        # Label the payload with the file's real type (JPEG, BMP, TIFF, ...)
        # instead of always claiming PNG.
        guessed, _ = mimetypes.guess_type(str(image_or_path))
        if guessed and guessed.startswith("image/"):
            mime_type = guessed
    elif isinstance(image_or_path, np.ndarray):
        img = Image.fromarray(image_or_path).convert("RGB")
        buf = io.BytesIO()
        img.save(buf, format="PNG")
        img_bytes = buf.getvalue()
    else:
        # Anything else is passed through as raw encoded image bytes.
        img_bytes = image_or_path

    b64 = base64.b64encode(img_bytes).decode("utf-8")
    image_url = f"data:{mime_type};base64,{b64}"

    messages = [{
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": image_url}},
            {"type": "text", "text": prompt},
        ],
    }]

    response = client.chat.completions.create(
        model=config.OCR_API_MODEL,
        messages=messages,
        max_tokens=max_new_tokens,
    )

    # The API may return a null content field; treat it as empty text
    # rather than crashing on None.strip().
    result_text = (response.choices[0].message.content or "").strip()
    elapsed = (time.time() - start_time) * 1000

    result = OCRResult(
        page_num=1,
        markdown_text=result_text,
        ocr_time_ms=elapsed,
        source_format="image",
        text_blocks=[{"type": task, "text": result_text, "bbox": []}],
    )

    logger.info(f"OCR API 完成, 耗时 {elapsed:.0f}ms, {len(result_text)} 字符")
    return [result]
378
+
379
@staticmethod
def extract_text(
    image_or_path: Union[str, Path, np.ndarray],
    task: Optional[str] = None,
) -> str:
    """Convenience wrapper: run ``extract`` and return only the recognized text.

    The ``markdown_text`` of every returned result is joined with newlines.
    """
    texts = [
        item.markdown_text
        for item in OCRApiExtractor.extract(image_or_path, task=task)
    ]
    return "\n".join(texts)
387
+
388
+
389
+ # ============================================================
390
+ # 统一提取器入口
391
+ # ============================================================
392
+
393
def _extract_ocr(image_or_path: Union[str, Path, np.ndarray]) -> List[OCRResult]:
    """Run OCR with the engine selected by ``config.OCR_ENGINE``.

    ``"api"`` routes to ``OCRApiExtractor``; every other value routes to
    ``VLOCRExtractor``.
    """
    extractor = OCRApiExtractor if config.OCR_ENGINE == "api" else VLOCRExtractor
    return extractor.extract(image_or_path)
399
+
400
+
401
+ # ============================================================
402
+ # PDF 工具
403
+ # ============================================================
404
+
405
class PDFUtils:
    """PDF helpers built on PyMuPDF: page rendering and text-layer inspection."""

    @staticmethod
    def render_page_to_image(page: fitz.Page, dpi: int = 300) -> np.ndarray:
        """Render a PyMuPDF page to a numpy image array (RGB).

        Args:
            page: The page to render.
            dpi: Target resolution; the zoom factor is ``dpi / 72``.

        Returns:
            The rendered page as a numpy array built from an RGB PIL image.
        """
        zoom = dpi / 72.0
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
        return np.array(img)

    @staticmethod
    def get_page_count(pdf_path: Path) -> int:
        """Return the number of pages in the PDF at ``pdf_path``."""
        # The context manager closes the document even if len() raises.
        with fitz.open(str(pdf_path)) as doc:
            return len(doc)

    @staticmethod
    def is_scanned_pdf(pdf_path: Path, sample_pages: int = 3) -> bool:
        """
        Guess whether a PDF is a scanned (image-only) document.

        The first ``sample_pages`` pages are checked for an extractable text
        layer; the PDF counts as scanned when they hold fewer than 100
        characters per checked page on average.

        Args:
            pdf_path: PDF file to inspect.
            sample_pages: Maximum number of leading pages to sample.

        Returns:
            True when the sampled pages contain almost no text. A PDF with
            no pages yields False, because 0 is not below the 0 threshold.
        """
        with fitz.open(str(pdf_path)) as doc:
            pages_to_check = min(sample_pages, len(doc))
            text_chars = sum(
                len(doc[i].get_text().strip()) for i in range(pages_to_check)
            )
        return text_chars < 100 * pages_to_check

    @staticmethod
    def extract_text_layer(pdf_path: Path) -> List[Dict[str, Any]]:
        """
        Extract the embedded text layer of a PDF (no OCR involved).

        Returns:
            One dict per page that has non-blank text, with the keys
            ``page_num`` (1-based), ``text``, ``char_count`` and
            ``has_text_layer``. Pages without text are omitted.
        """
        pages: List[Dict[str, Any]] = []
        with fitz.open(str(pdf_path)) as doc:
            for index in range(len(doc)):
                text = doc[index].get_text("text")
                if not text.strip():
                    continue
                pages.append({
                    "page_num": index + 1,
                    "text": text,
                    "char_count": len(text),
                    "has_text_layer": True,
                })
        return pages
465
+
466
+
467
+ # ============================================================
468
+ # LangChain PaddleOCR-VL-1.5 文档加载器
469
+ # ============================================================
470
+
471
class PaddleOCRLoader:
    """
    LangChain-compatible OCR document loader.

    A ``.pdf`` file is rendered page by page and every page image is passed
    to ``_extract_ocr``; any other supported extension is treated as a single
    image. The accepted extensions are those in ``config.SUPPORTED_FORMATS``.

    Usage:
        # Load a PDF
        loader = PaddleOCRLoader("document.pdf")
        documents = loader.load()

        # Load an image
        loader = PaddleOCRLoader("scan.png")
        documents = loader.load()

        # Lazy loading (recommended for large files)
        for doc in loader.lazy_load():
            process(doc)
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        dpi: int = config.PDF_RENDER_DPI,
        verbose: bool = True,
    ):
        """
        Args:
            file_path: File to load; it must exist.
            dpi: Resolution used when rendering PDF pages to images.
            verbose: When true, progress messages go to ``logger``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the extension is not in ``config.SUPPORTED_FORMATS``.
        """
        self.file_path = Path(file_path)
        if not self.file_path.exists():
            raise FileNotFoundError(f"文件不存在: {self.file_path}")

        self.suffix = self.file_path.suffix.lower()
        if self.suffix not in config.SUPPORTED_FORMATS:
            raise ValueError(
                f"不支持的文件格式: {self.suffix}. "
                f"支持: {config.SUPPORTED_FORMATS}"
            )

        self.dpi = dpi
        self.verbose = verbose
        self._doc_name = self.file_path.stem
        self._is_pdf = (self.suffix == ".pdf")

    def load(self) -> List[Document]:
        """Load the whole file and return the resulting Documents as a list."""
        return list(self.lazy_load())

    def lazy_load(self) -> Iterator[Document]:
        """Yield Documents one at a time (page by page for PDFs)."""
        if self._is_pdf:
            yield from self._load_pdf()
        else:
            yield from self._load_image()

    def _build_document(
        self,
        ocr_result: OCRResult,
        *,
        page: int,
        total_pages: int,
        ocr_time_ms: float,
        source_format: str,
        extra_metadata: Optional[Dict[str, Any]] = None,
    ) -> Optional[Document]:
        """Turn one OCR result into a Document, or None when it has no text.

        ``extra_metadata`` entries are inserted right after ``source_format``
        so the metadata key order of both loading paths stays stable.
        """
        text = ocr_result.markdown_text
        if not text and ocr_result.json_data:
            text = self._extract_text_from_json(ocr_result.json_data)

        # Some results carry the text wrapped in a dict under "text".
        if isinstance(text, dict):
            text = text.get("text", "") or ""
        if not text or not str(text).strip():
            return None

        metadata: Dict[str, Any] = {
            "source": str(self.file_path),
            "document_name": self._doc_name,
            "page": page,
            "total_pages": total_pages,
            "ocr_text_length": len(text),
            "ocr_time_ms": round(ocr_time_ms, 1),
            "dpi": self.dpi,
            "source_format": source_format,
        }
        if extra_metadata:
            metadata.update(extra_metadata)
        metadata["tables_count"] = len(ocr_result.tables)
        metadata["formulas_count"] = len(ocr_result.formulas)
        metadata["text_blocks_count"] = len(ocr_result.text_blocks)

        # Attach table / formula payloads only when present.
        if ocr_result.tables:
            metadata["tables_markdown"] = [
                t.get("markdown", "") for t in ocr_result.tables
            ]
            metadata["tables_html"] = [
                t.get("html", "") for t in ocr_result.tables
            ]
        if ocr_result.formulas:
            metadata["formulas_latex"] = [
                f.get("latex", "") for f in ocr_result.formulas
            ]

        return Document(page_content=text, metadata=metadata)

    def _load_pdf(self) -> Iterator[Document]:
        """Render each PDF page, OCR it and yield one Document per result."""
        total_start = time.time()
        pdf_doc = fitz.open(str(self.file_path))
        # try/finally guarantees the PDF handle is released even when OCR
        # raises or the consumer stops iterating early.
        try:
            page_count = len(pdf_doc)
            self._log(f"开始处理 PDF: {self.file_path.name} ({page_count} 页, DPI={self.dpi})")

            for page_idx in range(page_count):
                page_start = time.time()

                image = PDFUtils.render_page_to_image(pdf_doc[page_idx], dpi=self.dpi)
                results = _extract_ocr(image)

                # Drop the page bitmap right away; high-DPI renders can be
                # hundreds of MB.
                del image

                ocr_time = (time.time() - page_start) * 1000

                for ocr_result in results:
                    ocr_result.page_num = page_idx + 1
                    ocr_result.source_format = "pdf"

                    doc = self._build_document(
                        ocr_result,
                        page=page_idx + 1,
                        total_pages=page_count,
                        ocr_time_ms=ocr_time,
                        source_format="pdf",
                    )
                    if doc is None:
                        self._log(f" 第 {page_idx + 1} 页: 未检测到文本")
                        continue

                    self._log(
                        f" 第 {page_idx + 1}/{page_count} 页: "
                        f"{len(doc.page_content)} 字符, "
                        f"表格={doc.metadata['tables_count']}, "
                        f"公式={doc.metadata['formulas_count']}, "
                        f"耗时 {ocr_time:.0f}ms"
                    )

                    yield doc
        finally:
            pdf_doc.close()
            gc.collect()  # reclaim memory left over from page rendering

        self._log(f"PDF 处理完成, 总耗时 {time.time() - total_start:.1f}s")

    def _load_image(self) -> Iterator[Document]:
        """OCR a single image file and yield one Document per result.

        Raises:
            ValueError: If the file cannot be opened or verified as an image.
        """
        total_start = time.time()
        self._log(f"开始处理图片: {self.file_path.name}")

        # Validate that the image is readable and capture its size; both
        # handles are closed before OCR starts.
        try:
            with Image.open(self.file_path) as probe:
                probe.verify()
            # verify() leaves the object unusable, so reopen to read the size.
            with Image.open(self.file_path) as img:
                width, height = img.size
        except Exception as e:
            raise ValueError(f"无法读取图片文件: {e}") from e

        # The extractor accepts an image path directly.
        results = _extract_ocr(str(self.file_path))
        ocr_time = (time.time() - total_start) * 1000
        source_format = self.suffix.lstrip(".")

        for ocr_result in results:
            ocr_result.source_format = source_format

            doc = self._build_document(
                ocr_result,
                page=1,
                total_pages=1,
                ocr_time_ms=ocr_time,
                source_format=source_format,
                extra_metadata={"image_width": width, "image_height": height},
            )
            if doc is None:
                self._log(" 未检测到文本")
                continue
            yield doc

        self._log(f"图片处理完成, 耗时 {time.time() - total_start:.1f}s")

    def load_with_ocr_results(self) -> List[OCRResult]:
        """Return the raw ``OCRResult`` objects instead of Documents."""
        if not self._is_pdf:
            results = _extract_ocr(str(self.file_path))
            for r in results:
                r.source_format = self.suffix.lstrip(".")
            return results

        all_results: List[OCRResult] = []
        with fitz.open(str(self.file_path)) as pdf_doc:
            for page_idx in range(len(pdf_doc)):
                image = PDFUtils.render_page_to_image(pdf_doc[page_idx], dpi=self.dpi)
                results = _extract_ocr(image)
                for r in results:
                    r.page_num = page_idx + 1
                    r.source_format = "pdf"
                all_results.extend(results)
        return all_results

    @staticmethod
    def _extract_text_from_json(json_data: Dict) -> str:
        """Build plain text from an OCR JSON structure via ``VLOCRExtractor``."""
        return VLOCRExtractor._build_text_from_blocks(json_data)

    def _log(self, msg: str):
        """Log ``msg`` at info level when the loader is verbose."""
        if self.verbose:
            logger.info(msg)
699
+
700
+
701
+ # ============================================================
702
+ # 批量加载器
703
+ # ============================================================
704
+
705
class PaddleOCRDirectoryLoader:
    """Load every supported document file found under a directory."""

    def __init__(
        self,
        directory: Union[str, Path],
        glob_patterns: Optional[List[str]] = None,
        **loader_kwargs,
    ):
        """
        Args:
            directory: Root directory to search.
            glob_patterns: Glob patterns relative to ``directory``; defaults
                to recursive patterns for PDF and common image extensions.
            **loader_kwargs: Forwarded to every ``PaddleOCRLoader``.
        """
        self.directory = Path(directory)
        self.glob_patterns = glob_patterns or [
            "**/*.pdf", "**/*.png", "**/*.jpg", "**/*.jpeg",
            "**/*.bmp", "**/*.tif", "**/*.tiff",
        ]
        self.loader_kwargs = loader_kwargs

    def _find_files(self) -> List[Path]:
        """Return the sorted, de-duplicated regular files matching the patterns."""
        matches = set()
        for pattern in self.glob_patterns:
            # Skip directories whose names happen to match a pattern.
            matches.update(p for p in self.directory.glob(pattern) if p.is_file())
        return sorted(matches)

    def load(self) -> List[Document]:
        """Load all matching files and return their Documents as one list.

        A file that fails to load is logged and skipped so that one bad file
        does not abort the whole batch.
        """
        all_docs: List[Document] = []
        files = self._find_files()

        if not files:
            logger.warning(f"目录 {self.directory} 中未找到支持的文档文件")
            return all_docs

        logger.info(f"在 {self.directory} 中找到 {len(files)} 个文件")

        for file_path in files:
            try:
                docs = PaddleOCRLoader(file_path, **self.loader_kwargs).load()
            except Exception as e:
                logger.error(f" ✗ {file_path.name}: {e}")
                continue
            all_docs.extend(docs)
            logger.info(f" ✓ {file_path.name}: {len(docs)} 页/块")

        logger.info(f"批量加载完成, 共 {len(all_docs)} 个文档块")
        return all_docs

    def lazy_load(self) -> Iterator[Document]:
        """Yield Documents file by file; failing files are logged and skipped."""
        for file_path in self._find_files():
            try:
                loader = PaddleOCRLoader(file_path, **self.loader_kwargs)
                yield from loader.lazy_load()
            except Exception as e:
                logger.error(f"加载失败 {file_path.name}: {e}")
760
+
761
+
762
+ # ============================================================
763
+ # 便捷函数
764
+ # ============================================================
765
+
766
def load_document(file_path: Union[str, Path], **kwargs) -> List[Document]:
    """Convenience wrapper: load one file (PDF or image, chosen by extension).

    Keyword arguments are forwarded to ``PaddleOCRLoader``.
    """
    return PaddleOCRLoader(file_path, **kwargs).load()
770
+
771
+
772
def load_directory(directory: Union[str, Path], **kwargs) -> List[Document]:
    """Convenience wrapper: load every supported document under ``directory``.

    Keyword arguments are forwarded to ``PaddleOCRDirectoryLoader``.
    """
    return PaddleOCRDirectoryLoader(directory, **kwargs).load()
776
+
777
+
778
def ocr_to_markdown(file_path: Union[str, Path]) -> str:
    """Convenience wrapper: OCR ``file_path`` and return the Markdown text.

    Delegates to ``VLOCRExtractor.extract_to_markdown``.
    """
    markdown = VLOCRExtractor.extract_to_markdown(file_path)
    return markdown
781
+
782
+
783
def ocr_to_json(file_path: Union[str, Path], save_path: Optional[str] = None) -> Dict:
    """Convenience wrapper: OCR ``file_path`` and return the JSON result.

    Delegates to ``VLOCRExtractor.extract_to_json``, passing ``save_path``
    through unchanged.
    """
    payload = VLOCRExtractor.extract_to_json(file_path, save_path)
    return payload
786
+
787
+
788
+ # ============================================================
789
+ # 测试入口
790
+ # ============================================================
791
+
792
if __name__ == "__main__":
    import sys

    argv = sys.argv
    if len(argv) < 2:
        print(f"用法: python {__file__} <file_path> [--json] [--md]")
        print(f"支持格式: {config.SUPPORTED_FORMATS}")
        sys.exit(1)

    file_path = argv[1]
    # Output mode is one of doc / json / md; --json takes precedence over --md.
    output_mode = "json" if "--json" in argv else ("md" if "--md" in argv else "doc")

    # Constructing the loader up front validates the path and the extension
    # for every mode.
    loader = PaddleOCRLoader(file_path, verbose=True)

    if output_mode == "json":
        result = ocr_to_json(file_path)
        import json
        print(json.dumps(result, ensure_ascii=False, indent=2)[:5000])
    elif output_mode == "md":
        md = ocr_to_markdown(file_path)
        print(md[:5000])
    else:
        documents = loader.load()
        rule = "=" * 60
        print(f"\n{rule}")
        print(f"共加载 {len(documents)} 页/文档")
        print(f"{rule}")
        for doc in documents:
            content = doc.page_content
            print(f"\n--- 第 {doc.metadata.get('page', '?')} 页 "
                  f"({len(content)} 字符) ---")
            # Show only a 500-character preview of each page.
            print(content[:500])
            if len(content) > 500:
                print("...")
            print(f" 元数据: source={doc.metadata.get('document_name')}, "
                  f"tables={doc.metadata.get('tables_count', 0)}, "
                  f"formulas={doc.metadata.get('formulas_count', 0)}")
rag_chain.py ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ============================================================
3
+ RAG 检索增强生成问答链
4
+ ============================================================
5
+ LLM: Qwen3-8B (通过 OpenAI 兼容 API 调用)
6
+ 嵌入: Qwen3-Embedding (通过 OpenAI 兼容 API 调用)
7
+
8
+ 所有模型均通过 API 调用, 无需本地推理:
9
+ - Embedding API: /v1/embeddings
10
+ - LLM API: /v1/chat/completions
11
+
12
+ 支持任意 OpenAI 兼容 API:
13
+ - vLLM 部署的 Qwen3 / Llama / DeepSeek 等
14
+ - 第三方 API (DeepSeek, 通义千问, 智谱 GLM 等)
15
+ - OpenAI 官方 API
16
+
17
+ 功能:
18
+ 1. LangChain LCEL RAG 问答链
19
+ 2. 多轮对话
20
+ 3. 流式输出
21
+ 4. 来源引用
22
+ """
23
+
24
+ from typing import List, Optional, Dict, Any, Iterator
25
+
26
+ from langchain_core.documents import Document
27
+ from langchain_core.prompts import ChatPromptTemplate
28
+ from langchain_core.runnables import RunnableParallel
29
+ from langchain_core.output_parsers import StrOutputParser
30
+ from langchain_core.language_models import BaseChatModel
31
+ from langchain_core.messages import HumanMessage, SystemMessage
32
+
33
+ from langchain_openai import ChatOpenAI
34
+
35
+ from loguru import logger
36
+
37
+ import config
38
+ from vector_store import VectorStoreManager
39
+
40
+
41
+ # ============================================================
42
+ # LLM 工厂 (纯 API 模式)
43
+ # ============================================================
44
+
45
def create_llm(
    model_name: Optional[str] = None,
    api_base: Optional[str] = None,
    api_key: Optional[str] = None,
    temperature: Optional[float] = None,
    max_tokens: Optional[int] = None,
) -> ChatOpenAI:
    """
    Create a chat model client for an OpenAI-compatible API.

    Every argument falls back to the matching ``config.LLM_*`` value when it
    is not supplied.

    Args:
        model_name: Model name, e.g. ``Qwen/Qwen3-8B``.
        api_base: Base URL of the API.
        api_key: API key.
        temperature: Sampling temperature; an explicit ``0`` is honored.
        max_tokens: Maximum number of output tokens.

    Returns:
        A configured ``ChatOpenAI`` instance.
    """
    # Numeric options are compared against None instead of using `or`, so a
    # caller asking for temperature=0.0 is not silently overridden by the
    # configured default.
    if temperature is None:
        temperature = config.LLM_TEMPERATURE
    if max_tokens is None:
        max_tokens = config.LLM_MAX_TOKENS

    return ChatOpenAI(
        model=model_name or config.LLM_MODEL_NAME,
        api_key=api_key or config.LLM_API_KEY,
        base_url=api_base or config.LLM_API_BASE,
        temperature=temperature,
        max_tokens=max_tokens,
    )
72
+
73
+
74
+ # ============================================================
75
+ # RAG 问答链
76
+ # ============================================================
77
+
78
class RAGChain:
    """
    Retrieval-augmented generation chain.

    Flow:
        query -> vector-store retrieval -> context formatting ->
        prompt template -> LLM -> answer (with sources)

    Usage:
        rag = RAGChain(vector_store_manager)
        result = rag.query("文档主要内容是什么?")
    """

    # Character budget for the whole retrieved context. The original note
    # says it is sized for a small-memory deployment with a 1152-token limit.
    MAX_CONTEXT_CHARS = 1800

    def __init__(
        self,
        vector_store_manager: VectorStoreManager,
        llm: Optional[BaseChatModel] = None,
        top_k: int = config.RETRIEVAL_TOP_K,
        system_prompt: Optional[str] = None,
        search_type: str = "similarity",
    ):
        """
        Args:
            vector_store_manager: Store used for retrieval.
            llm: Chat model; ``create_llm()`` is used when omitted.
            top_k: Number of documents to retrieve per query.
            system_prompt: System prompt; defaults to ``config.SYSTEM_PROMPT``.
            search_type: ``"mmr"``, ``"similarity_score"`` or anything else
                for plain similarity search.
        """
        self.vector_store_manager = vector_store_manager
        self.llm = llm or create_llm()
        self.top_k = top_k
        self.system_prompt = system_prompt or config.SYSTEM_PROMPT
        self.search_type = search_type
        self._answer_chain = self._build_answer_chain()
        self._chain = self._build_chain()

        logger.info(
            f"RAG 问答链初始化完成 (LLM={config.LLM_MODEL_NAME}, "
            f"top_k={top_k}, search={search_type})"
        )

    def _build_answer_chain(self):
        """Build the generation half: prompt -> LLM -> string output.

        It expects a mapping with ``context``, ``question`` and
        ``system_prompt`` and performs no retrieval itself.
        """
        prompt = ChatPromptTemplate.from_messages([
            ("system", "{system_prompt}"),
            ("human", config.RAG_PROMPT_TEMPLATE),
        ])
        return prompt | self.llm | StrOutputParser()

    def _build_chain(self):
        """Build the full LCEL chain: retrieval inputs piped into generation."""
        inputs = RunnableParallel({
            "context": lambda inputs: self._retrieve_and_format(inputs["query"]),
            "question": lambda inputs: inputs["query"],
            "system_prompt": lambda _: self.system_prompt,
        })
        return inputs | self._answer_chain

    def _retrieve_and_format(self, query: str) -> str:
        """Retrieve documents for ``query`` and format them as context text."""
        return self._format_docs(self._retrieve(query))

    def _retrieve(self, query: str) -> List[Document]:
        """Fetch the ``top_k`` documents using the configured search type."""
        if self.search_type == "mmr":
            return self.vector_store_manager.max_marginal_relevance_search(
                query, k=self.top_k
            )
        if self.search_type == "similarity_score":
            results = self.vector_store_manager.similarity_search_with_score(
                query, k=self.top_k
            )
            return [doc for doc, _ in results]
        return self.vector_store_manager.similarity_search(query, k=self.top_k)

    @classmethod
    def _format_docs(cls, docs: List[Document]) -> str:
        """Join documents into one context string within the character budget.

        Each document gets an equal share of ``MAX_CONTEXT_CHARS``; longer
        contents are truncated and suffixed with ``...``.
        """
        if not docs:
            return "(未找到相关文档内容)"

        max_chunk_chars = cls.MAX_CONTEXT_CHARS // len(docs)

        parts = []
        for i, doc in enumerate(docs, 1):
            page = doc.metadata.get("page", "未知")
            doc_name = doc.metadata.get("document_name", "未知文档")

            content = doc.page_content
            if len(content) > max_chunk_chars:
                content = content[:max_chunk_chars] + "..."

            parts.append(f"[{i}] {doc_name} p{page}\n{content}")

        return "\n\n---\n\n".join(parts)

    # ---- Query interface ----

    def query(self, question: str) -> Dict[str, Any]:
        """
        Answer a single question.

        Returns:
            {"query": str, "answer": str, "sources": [...], "context": str}
        """
        logger.info(f"RAG 查询: {question[:100]}...")

        # Retrieve once and feed that exact context to the LLM. This avoids a
        # second retrieval round-trip and guarantees the reported sources are
        # the documents the answer was generated from.
        retrieved_docs = self._retrieve(question)
        context = self._format_docs(retrieved_docs)
        answer = self._answer_chain.invoke({
            "context": context,
            "question": question,
            "system_prompt": self.system_prompt,
        })

        sources = self._build_sources(retrieved_docs)

        logger.info(f"生成完成: {len(answer)} 字符, {len(sources)} 个来源")
        return {
            "query": question,
            "answer": answer,
            "sources": sources,
            "context": context,
        }

    def query_stream(self, question: str) -> Iterator[str]:
        """Answer a question, yielding the output chunk by chunk."""
        logger.info(f"RAG 流式查询: {question[:100]}...")
        yield from self._chain.stream({"query": question})

    def query_with_history(
        self,
        question: str,
        chat_history: Optional[List[Dict[str, str]]] = None,
    ) -> Dict[str, Any]:
        """Answer a question with earlier conversation turns as extra context.

        Args:
            question: The current user question.
            chat_history: Earlier turns as dicts with ``role`` and ``content``.

        Returns:
            The same mapping shape as ``query``.
        """
        chat_history = chat_history or []

        history_context = self._format_history(chat_history)
        retrieved_docs = self._retrieve(question)
        context = self._format_docs(retrieved_docs)

        messages = [
            SystemMessage(content=(
                f"{self.system_prompt}\n\n"
                f"## 对话历史:\n{history_context}"
            )),
            HumanMessage(content=config.RAG_PROMPT_TEMPLATE.format(
                system_prompt="",
                context=context,
                question=question,
            )),
        ]

        response = self.llm.invoke(messages)

        return {
            "query": question,
            "answer": response.content,
            "sources": self._build_sources(retrieved_docs),
            "context": context,
        }

    @staticmethod
    def _build_sources(docs: List[Document]) -> List[Dict[str, Any]]:
        """Summarize retrieved documents as ranked source entries.

        Content is cut to a 300-character preview.
        """
        return [
            {
                "rank": i,
                "content": doc.page_content[:300],
                "page": doc.metadata.get("page", "未知"),
                "document": doc.metadata.get("document_name", "未知"),
                "content_type": doc.metadata.get("content_type", "text"),
            }
            for i, doc in enumerate(docs, 1)
        ]

    @staticmethod
    def _format_history(chat_history: List[Dict[str, str]]) -> str:
        """Render the most recent turns as ``role: content`` lines."""
        if not chat_history:
            return "(无历史对话)"
        # Keep the last 8 messages, i.e. the 4 most recent rounds.
        return "\n".join(
            f"{'用户' if turn.get('role') == 'user' else '助手'}: {turn.get('content', '')}"
            for turn in chat_history[-8:]
        )
258
+
259
+
260
+ # ============================================================
261
+ # PDF 完整问答流水线
262
+ # ============================================================
263
+
264
class PDFRAGPipeline:
    """
    End-to-end document question-answering pipeline (all models via API).

    One object covers: OCR -> cleaning -> splitting -> embedding ->
    vector-store ingestion -> question answering.

    Usage:
        pipeline = PDFRAGPipeline()
        pipeline.ingest("document.pdf")
        result = pipeline.ask("文档主要内容是什么?")
    """

    def __init__(
        self,
        llm: Optional[BaseChatModel] = None,
        store_type: Optional[str] = None,
        chunk_size: int = config.CHUNK_SIZE,
        chunk_overlap: int = config.CHUNK_OVERLAP,
        verbose: bool = True,
    ):
        """
        Args:
            llm: Chat model; ``create_llm()`` is used when omitted.
            store_type: Vector store type; defaults to
                ``config.VECTOR_STORE_TYPE``.
            chunk_size: Chunk size passed to the text-processing pipeline.
            chunk_overlap: Chunk overlap passed to the text-processing pipeline.
            verbose: When true, progress messages are printed.
        """
        self.llm = llm or create_llm()
        self.store_type = store_type or config.VECTOR_STORE_TYPE
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.verbose = verbose

        self._vector_store_manager: Optional[VectorStoreManager] = None
        self._rag_chain: Optional[RAGChain] = None

    def ingest(self, file_path: str, clear_existing: bool = True) -> int:
        """
        OCR a document, split it and store the chunks in the vector store.

        Args:
            file_path: Document to ingest (a format ``PaddleOCRLoader`` accepts).
            clear_existing: When true, the store is cleared before adding.

        Returns:
            The count reported by the vector store's ``add_documents``.
        """
        from ocr_loader import PaddleOCRLoader
        from text_processor import TextProcessingPipeline

        logger.info(f"开始入库: {file_path}")

        # Step 1: OCR
        self._log("Step 1/4: PaddleOCR-VL-1.5 识别...")
        loader = PaddleOCRLoader(file_path, verbose=False)
        raw_docs = loader.load()
        self._log(f" ✓ 识别完成: {len(raw_docs)} 页/文档")

        # Step 2: cleaning and splitting
        self._log("Step 2/4: 文本清洗与分割...")
        pipeline = TextProcessingPipeline(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        chunks = pipeline.process(raw_docs)
        self._log(f" ✓ 分割完成: {len(chunks)} 个文本块")

        # Step 3: embedding and storage
        self._log("Step 3/4: Embedding API 向量化...")
        self._vector_store_manager = VectorStoreManager(store_type=self.store_type)
        if clear_existing:
            self._vector_store_manager.clear()
        chunk_count = self._vector_store_manager.add_documents(chunks)
        self._log(f" ✓ 入库完成: {chunk_count} 个文本块")

        # Step 4: (re)build the question-answering chain on the fresh store
        self._log("Step 4/4: 初始化 RAG 引擎...")
        self._rag_chain = RAGChain(
            vector_store_manager=self._vector_store_manager,
            llm=self.llm,
        )
        self._log(" ✓ 问答引擎就绪")
        self._log("入库完成! 可以开始提问。")

        return chunk_count

    def ingest_multiple(self, file_paths: List[str], clear_existing: bool = True) -> int:
        """Ingest several files; only the first call may clear the store.

        Returns:
            The sum of the per-file counts returned by ``ingest``.
        """
        total = 0
        for i, fp in enumerate(file_paths):
            total += self.ingest(fp, clear_existing=(clear_existing and i == 0))
        return total

    def _ensure_chain(self) -> RAGChain:
        """Return the RAG chain, attaching to an existing store if needed.

        Raises:
            RuntimeError: If nothing was ingested and the store is empty.
        """
        if self._rag_chain is None:
            if self._vector_store_manager is None:
                self._vector_store_manager = VectorStoreManager(store_type=self.store_type)
            if self._vector_store_manager.get_document_count() == 0:
                raise RuntimeError("向量数据库为空! 请先调用 ingest() 处理文档。")
            self._rag_chain = RAGChain(
                vector_store_manager=self._vector_store_manager,
                llm=self.llm,
            )
        return self._rag_chain

    def ask(self, question: str) -> Dict[str, Any]:
        """Answer one question; see ``RAGChain.query`` for the result shape."""
        return self._ensure_chain().query(question)

    def ask_stream(self, question: str) -> Iterator[str]:
        """Answer one question as a stream of text chunks."""
        # The chain is resolved eagerly so that a missing store raises at
        # call time rather than on first iteration.
        return self._ensure_chain().query_stream(question)

    def ask_with_history(
        self, question: str,
        chat_history: Optional[List[Dict[str, str]]] = None,
    ) -> Dict[str, Any]:
        """Answer one question using earlier conversation turns as context."""
        return self._ensure_chain().query_with_history(question, chat_history)

    @property
    def is_ready(self) -> bool:
        """True when the vector store holds at least one document.

        Any error while opening or querying the store is reported as False.
        """
        try:
            if self._vector_store_manager is None:
                self._vector_store_manager = VectorStoreManager(store_type=self.store_type)
            return self._vector_store_manager.get_document_count() > 0
        except Exception:
            return False

    @property
    def stats(self) -> Dict[str, Any]:
        """Vector store statistics, or a ``not_initialized`` status marker."""
        if self._vector_store_manager is None:
            return {"status": "not_initialized"}
        return self._vector_store_manager.get_stats()

    def _log(self, msg: str):
        """Print ``msg`` when the pipeline is verbose."""
        if self.verbose:
            print(msg)
386
+
387
+
388
+ # ============================================================
389
+ # 便捷函数
390
+ # ============================================================
391
+
392
def quick_qa(file_path: str, question: str) -> Dict[str, Any]:
    """Convenience wrapper: ingest one document and answer one question.

    The vector store is rebuilt from scratch (``clear_existing=True``) on
    every call.
    """
    from ocr_loader import PaddleOCRLoader
    from text_processor import TextProcessingPipeline
    from vector_store import build_vector_store

    documents = PaddleOCRLoader(file_path, verbose=False).load()
    chunks = TextProcessingPipeline().process(documents)
    store = build_vector_store(chunks, clear_existing=True)
    return RAGChain(vector_store_manager=store).query(question)
405
+
406
+
407
+ # ============================================================
408
+ # 测试入口
409
+ # ============================================================
410
+
411
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 3:
        print(f"用法: python {__file__} <file_path> <question>")
        print(f"示例: python {__file__} document.pdf '文档主要内容是什么?'")
        sys.exit(1)

    file_path, question = sys.argv[1], sys.argv[2]
    rule = "=" * 60

    # Header describing the run
    print(f"\n{rule}")
    print(" PDF/文档 智能问答测试")
    print(f" 文件: {file_path}")
    print(f" 问题: {question}")
    print(rule)

    result = quick_qa(file_path, question)

    # Answer
    print(f"\n{rule}")
    print(" 回答:")
    print(rule)
    print(result["answer"])

    # Sources, each with a 150-character content preview
    print(f"\n{rule}")
    print(" 参考来源:")
    print(rule)
    for src in result["sources"]:
        print(f" [{src['rank']}] {src['document']} 第{src['page']}页 ({src['content_type']})")
        print(f" {src['content'][:150]}...")
requirements.txt CHANGED
@@ -1,18 +1,32 @@
1
- python-dotenv==1.2.1
 
 
 
 
2
 
3
- numpy==2.2.6
4
- Pillow==12.0.0
5
- fastapi==0.135.1
6
- uvicorn==0.41.0
7
- openai==2.20.0
8
- requests==2.32.5
9
- pydantic==2.12.5
10
- gradio==6.8.0
11
- moviepy==1.0.3
12
- opencv-python
13
- modelscope==1.34.0
14
- qwen-vl-utils==0.0.14
15
- funasr==1.3.1
16
- nano-vectordb==0.0.4.3
17
- tqdm==4.67.3
18
- soundfile
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================================
2
+ # PDF OCR 智能问答系统 依赖
3
+ # 模型栈: PaddleOCR-VL-0.9B (API) + Qwen3-Embedding-0.6B (API) + Qwen3-1.7B (API)
4
+ # 需事先启动 vLLM 或其他兼容 API 服务
5
+ # ============================================================
6
 
7
+
8
+ # --- PDF & 图片处理 ---
9
+ PyMuPDF>=1.24.0
10
+ Pillow>=10.0.0
11
+ numpy>=1.24.0
12
+
13
+ # --- LangChain 生态 ---
14
+ langchain>=0.3.0
15
+ langchain-core>=0.3.0
16
+ langchain-community>=0.3.0
17
+ langchain-text-splitters>=0.3.0
18
+ langchain-openai>=0.2.0 # OpenAI 兼容 API 客户端 (Embedding + LLM)
19
+
20
+ # --- 向量数据库 ---
21
+ chromadb>=0.5.0
22
+ # faiss-cpu (可选)
23
+
24
+ # --- Web UI ---
25
+ fastapi>=0.110.0
26
+ uvicorn>=0.29.0
27
+ python-multipart>=0.0.9
28
+
29
+ # --- 工具 ---
30
+ python-dotenv>=1.0.0
31
+ tqdm>=4.66.0
32
+ loguru>=0.7.0
run.py ADDED
@@ -0,0 +1,465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ ============================================================
4
+ PDF OCR 智能问答系统 — 端到端运行脚本
5
+ ============================================================
6
+
7
+ 用法:
8
+ # 交互模式: 处理文档后进入问答 REPL
9
+ python run.py -f document.pdf
10
+
11
+ # 单次问答
12
+ python run.py -f document.pdf -q "文档主要内容是什么?"
13
+
14
+ # 批量处理多个文档
15
+ python run.py -f doc1.pdf doc2.png scan3.jpg
16
+
17
+ # 指定分块参数
18
+ python run.py -f document.pdf --chunk-size 1000 --chunk-overlap 200
19
+
20
+ # 从已有向量库加载 (跳过 OCR, 直接问答)
21
+ python run.py --load
22
+
23
+ # 清空旧数据重新处理
24
+ python run.py -f document.pdf --clear
25
+
26
+ # 显示检索到的原文
27
+ python run.py -f document.pdf -q "问题" --show-sources
28
+
29
+ 环境变量 (或 .env 文件):
30
+ EMBEDDING_API_BASE Embedding API 地址
31
+ EMBEDDING_MODEL_NAME Embedding 模型名
32
+ LLM_API_BASE LLM API 地址
33
+ LLM_API_KEY LLM API Key
34
+ LLM_MODEL_NAME LLM 模型名
35
+ """
36
+
37
+ import argparse
38
+ import json
39
+ import os
40
+ import sys
41
+ import time
42
+ from pathlib import Path
43
+ from typing import List, Optional
44
+
45
+ # ---- 环境补丁 (必须在其他导入之前) ----
46
def _patch():
    """Prepare the import environment; must run before the project imports.

    * Makes ``langchain_text_splitters`` importable. The real package is used
      when it is installed; only when it cannot be imported is an empty stub
      package registered in ``sys.modules``.
    * Imports ``torch`` early when it is available; a missing torch is
      ignored.
    """
    import importlib
    import types as _types

    name = "langchain_text_splitters"
    if name not in sys.modules:
        # Try the real package first. Registering the stub unconditionally
        # would shadow an installed package and hide every name it exports.
        try:
            importlib.import_module(name)
        except ImportError:
            stub = _types.ModuleType(name)
            stub.__path__ = []  # mark the stub as a package
            sys.modules[name] = stub
    try:
        import torch  # noqa: F401
    except ImportError:
        pass
56
+
57
+
58
+ _patch()
59
+
60
+ # 项目导入
61
+ sys.path.insert(0, str(Path(__file__).resolve().parent))
62
+
63
+ import config
64
+ from ocr_loader import PaddleOCRLoader
65
+ from text_processor import TextProcessingPipeline, RecursiveCharacterTextSplitter
66
+ from embeddings import get_embedding_model
67
+ from vector_store import VectorStoreManager, build_vector_store
68
+ from rag_chain import RAGChain, create_llm, PDFRAGPipeline
69
+
70
+ # 将内置分割器注入到 mock 模块
71
+ import sys as _sys
72
+ _lts = _sys.modules.get("langchain_text_splitters")
73
+ if _lts is not None:
74
+ _lts.RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter
75
+
76
+ from loguru import logger
77
+
78
+ # ============================================================
79
+ # Banner
80
+ # ============================================================
81
+
82
+ BANNER = r"""
83
+ ┌──────────────────────────────────────────────────────┐
84
+ │ 📄 PDF OCR 智能问答系统 │
85
+ │ │
86
+ │ OCR: PaddleOCR-VL-1.5 (本地) │
87
+ │ 嵌入: {emb_model} │
88
+ │ LLM: {llm_model} │
89
+ │ 向量库: {vec_store} │
90
+ └──────────────────────────────────────────────────────┘
91
+ """
92
+
93
+
94
def print_banner():
    """Print the start-up banner filled in with the configured model names.

    The embedding and LLM model names are shortened to 32 characters plus
    ``"..."`` when they exceed 35 characters; the vector store type is
    inserted unchanged.
    """
    def clip(model_name):
        # Shorten over-long model names before they are placed in the banner.
        if len(model_name) > 35:
            return model_name[:32] + "..."
        return model_name

    embedding_label = clip(config.EMBEDDING_MODEL_NAME)
    llm_label = clip(config.LLM_MODEL_NAME)
    print(BANNER.format(
        emb_model=embedding_label,
        llm_model=llm_label,
        vec_store=config.VECTOR_STORE_TYPE,
    ))
104
+
105
+
106
+ # ============================================================
107
+ # 步骤函数
108
+ # ============================================================
109
+
110
def _save_documents(docs: list, path: Path, label: str = "文档"):
    """Write a list of document objects to *path* as a JSON array.

    Each element of *docs* must expose ``page_content`` and a ``metadata``
    mapping. Only metadata values that are ``str``, ``int``, ``float``,
    ``bool`` or ``None`` are kept; everything else is dropped.

    Args:
        docs: Documents to serialise.
        path: Destination file; missing parent directories are created.
        label: Text inserted into the confirmation message printed afterwards.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    json_scalars = (str, int, float, bool, type(None))
    records = [
        {
            "page_content": item.page_content,
            "metadata": {
                key: value
                for key, value in item.metadata.items()
                if isinstance(value, json_scalars)
            },
        }
        for item in docs
    ]

    with path.open("w", encoding="utf-8") as sink:
        json.dump(records, sink, ensure_ascii=False, indent=2)
    print(f" 💾 {label}已保存: {path} ({len(records)} 条)")
123
+
124
+
125
def step_ocr(file_paths: List[str], output_dir: Optional[Path] = None) -> list:
    """Step 1: run OCR over every input file and collect the results.

    Files that do not exist are logged as errors and skipped; files whose
    suffix is not in ``config.SUPPORTED_FORMATS`` are logged as warnings and
    skipped.

    Args:
        file_paths: Paths of the files to recognise.
        output_dir: When given and at least one document was produced, all
            results are saved together to ``ocr_results.json`` in this directory.

    Returns:
        The concatenated documents returned by the loader for all files.
    """
    collected = []
    for raw_path in file_paths:
        source = Path(raw_path)
        if not source.exists():
            logger.error(f"文件不存在: {source}")
            continue

        extension = source.suffix.lower()
        if extension not in config.SUPPORTED_FORMATS:
            logger.warning(f"跳过不支持格式: {source} (支持: {config.SUPPORTED_FORMATS})")
            continue

        if extension == ".pdf":
            icon = "📄"
        else:
            icon = "🖼️"
        print(f" {icon} 正在识别: {source.name} ...", end=" ", flush=True)

        started = time.time()
        pages = PaddleOCRLoader(str(source), verbose=True).load()
        took = time.time() - started
        print(f"{len(pages)} 页/文档 ({took:.1f}s)")
        collected.extend(pages)

    # Persist once, after every file has been processed.
    if output_dir and collected:
        _save_documents(collected, output_dir / "ocr_results.json", "OCR结果 ")

    return collected
153
+
154
+
155
def step_process(
    documents: list, chunk_size: int, chunk_overlap: int,
    output_dir: Optional[Path] = None
) -> list:
    """Step 2: run the text-processing pipeline and split documents into chunks.

    Args:
        documents: Documents produced by the OCR step.
        chunk_size: Forwarded to ``TextProcessingPipeline`` as ``chunk_size``.
        chunk_overlap: Forwarded to ``TextProcessingPipeline`` as ``chunk_overlap``.
        output_dir: When given and at least one chunk was produced, the chunks
            are saved to ``chunks.json`` in this directory.

    Returns:
        The chunks returned by the pipeline.
    """
    print(f" ✂️ 正在分割: {len(documents)} 个文档 ...", end=" ", flush=True)

    started = time.time()
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    pieces = TextProcessingPipeline(**splitter_options).process(documents)
    took = time.time() - started
    print(f"→ {len(pieces)} 个文本块 ({took:.1f}s)")

    if output_dir and pieces:
        _save_documents(pieces, output_dir / "chunks.json", "分块结果 ")

    return pieces
175
+
176
+
177
def step_embed(chunks: list, clear_existing: bool = True) -> VectorStoreManager:
    """Step 3: embed the chunks and store them in the vector store.

    Args:
        chunks: Text chunks produced by ``step_process``.
        clear_existing: Forwarded to ``build_vector_store`` as
            ``clear_existing``. Defaults to ``True``, which is what this
            function always passed before the parameter existed.

    Returns:
        The manager returned by ``build_vector_store``.
    """
    print(f" 🧠 正在向量化: {len(chunks)} 个文本块 ...", end=" ", flush=True)
    t0 = time.time()
    manager = build_vector_store(chunks, clear_existing=clear_existing)
    elapsed = time.time() - t0
    print(f"完成 ({elapsed:.1f}s)")
    return manager


def step_rag(manager: VectorStoreManager):
    """Step 4: build a RAG chain on top of *manager* using ``create_llm()``."""
    llm = create_llm()
    chain = RAGChain(vector_store_manager=manager, llm=llm)
    return chain


# ============================================================
# Core workflows
# ============================================================

def run_ingest(
    file_paths: List[str],
    chunk_size: int = config.CHUNK_SIZE,
    chunk_overlap: int = config.CHUNK_OVERLAP,
    clear: bool = True,
    output_dir: Optional[Path] = None,
) -> VectorStoreManager:
    """Run the full ingestion flow: OCR -> process -> embed -> store.

    Args:
        file_paths: Paths of the documents to ingest.
        chunk_size: Chunk size handed to ``step_process``.
        chunk_overlap: Chunk overlap handed to ``step_process``.
        clear: Whether existing vector-store data is cleared before the new
            chunks are written. Previously this argument was accepted but
            ignored (the store was always cleared); it is now honoured.
        output_dir: Directory for intermediate JSON results, or ``None`` to
            skip saving them.

    Returns:
        The vector store manager holding the ingested chunks.

    Exits the process with status 1 when OCR yields no documents.
    """
    print("\n" + "─" * 55)
    print(" 📥 阶段 1: 文档入库")
    print("─" * 55)

    # Step 1: OCR
    t_start = time.time()
    documents = step_ocr(file_paths, output_dir=output_dir)
    if not documents:
        logger.error("未识别到任何文本内容, 请检查文件是否包含可读文字")
        sys.exit(1)
    print(f" 总计: {len(documents)} 个原始文档页")

    # Step 2: clean and split
    chunks = step_process(documents, chunk_size, chunk_overlap,
                          output_dir=output_dir)

    # Step 3: embed and store, honouring the caller's `clear` choice
    manager = step_embed(chunks, clear_existing=clear)

    total_time = time.time() - t_start
    print(f"\n ✅ 入库完成 (总耗时 {total_time:.1f}s)")
    print(f" 文档: {len(documents)} 页 → {len(chunks)} 个文本块")
    # The value printed here is the embedding model name, so label it as such
    # (the old label claimed it was the vector dimension).
    print(f" 嵌入模型: {config.EMBEDDING_MODEL_NAME}")
    print(f" 存储: {config.VECTOR_STORE_TYPE} @ {config.VECTOR_DB_DIR}")

    return manager
232
+
233
+
234
def run_qa(chain: RAGChain, question: str, show_sources: bool = False):
    """Ask a single question and print the answer.

    Args:
        chain: Object whose ``query(question)`` returns a mapping with an
            ``"answer"`` entry and a ``"sources"`` list.
        question: The question text.
        show_sources: When true, each source's rank, document, page, content
            type and the first 120 characters of its content are printed too.

    Returns:
        The mapping returned by ``chain.query``.
    """
    rule = "─" * 55

    print("\n" + rule)
    print(f" ❓ 问题: {question}")
    print(rule)

    started = time.time()
    outcome = chain.query(question)
    took = time.time() - started

    print(f"\n 🤖 回答 ({took:.1f}s):")
    print(rule)
    print(outcome["answer"])

    if not show_sources:
        return outcome

    references = outcome["sources"]
    print(f"\n 📚 参考来源 ({len(references)} 条):")
    print(rule)
    for ref in references:
        print(f" [{ref['rank']}] {ref['document']} | 第{ref['page']}页 "
              f"| {ref['content_type']}")
        print(f" {ref['content'][:120]}...")

    return outcome
257
+
258
+
259
def run_repl(chain: RAGChain):
    """Run an interactive question-answering loop on standard input.

    Lines starting with ``:`` are commands: ``:q``/``:quit``/``:exit`` leave
    the loop, ``:s`` toggles source display, ``:c`` clears the terminal and
    ``:h`` prints help. Any other non-empty line is sent to
    ``chain.query_with_history`` together with the conversation so far, and
    the exchange is appended to the history afterwards. EOF or Ctrl-C at the
    prompt ends the loop.
    """
    rule = "─" * 55
    print("\n" + rule)
    print(" 💬 交互问答模式")
    print(rule)
    print(" 输入问题后回车, 输入 :s 切换来源显示")
    print(" 输入 :q 退出, :c 清屏, :h 帮助")
    print(rule)

    history = []
    with_sources = False

    while True:
        try:
            line = input("\n 🔍 > ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\n 再见! 👋")
            break

        if not line:
            continue

        # Command handling
        if line.startswith(":"):
            command = line[1:].strip().lower()
            if command in ("q", "quit", "exit"):
                print(" 再见! 👋")
                break
            if command == "s":
                with_sources = not with_sources
                print(f" 来源显示: {'开启' if with_sources else '关闭'}")
            elif command == "c":
                os.system("cls" if os.name == "nt" else "clear")
            elif command == "h":
                print(" 命令: :q 退出 | :s 切换来源 | :c 清屏 | :h 帮助")
            else:
                print(f" 未知命令: {line}")
            continue

        # Question answering
        started = time.time()
        reply = chain.query_with_history(line, history)
        took = time.time() - started

        print(f"\n 🤖 ({took:.1f}s):")
        print(f" {reply['answer']}")

        if with_sources:
            print(f"\n 📚 来源 ({len(reply['sources'])} 条):")
            for ref in reply["sources"]:
                print(f" [{ref['rank']}] {ref['document']} "
                      f"第{ref['page']}页 | {ref['content_type']}")

        # Record the turn only after it has been answered.
        history.append({"role": "user", "content": line})
        history.append({"role": "assistant", "content": reply["answer"]})
314
+
315
+
316
+ # ============================================================
317
+ # API 连通性检查
318
+ # ============================================================
319
+
320
def check_apis() -> bool:
    """Check whether the Embedding API and the LLM API are reachable.

    For each of ``config.EMBEDDING_API_BASE`` and ``config.LLM_API_BASE`` the
    ``<base>/models`` URL is requested with a 5-second timeout. The probe
    first uses HEAD; if the server answers 405 or 501 (HEAD not supported for
    that route) it is retried with GET so that such servers are not reported
    as unreachable. Responses are always closed.

    Results are printed to stdout; no exception propagates to the caller.

    Returns:
        ``True`` when both endpoints answered successfully, else ``False``.
    """
    import urllib.error
    import urllib.request

    def _probe(base_url: str) -> None:
        """Request ``<base_url>/models``; raise if the endpoint cannot be reached."""
        target = f"{base_url}/models"
        try:
            head = urllib.request.Request(target, method="HEAD")
            with urllib.request.urlopen(head, timeout=5):
                return
        except urllib.error.HTTPError as err:
            err.close()
            if err.code not in (405, 501):
                raise
        # The server is up but rejects HEAD on this route: fall back to GET.
        get = urllib.request.Request(target, method="GET")
        with urllib.request.urlopen(get, timeout=5):
            pass

    endpoints = (
        ("Embedding API", config.EMBEDDING_API_BASE),
        ("LLM API", config.LLM_API_BASE),
    )

    all_ok = True
    for label, base in endpoints:
        url = base.rstrip("/")
        try:
            _probe(url)
        except Exception as e:
            # Diagnostic boundary: any failure is reported, never raised.
            print(f" ⚠️ {label}: {url} — {e}")
            all_ok = False
        else:
            print(f" ✅ {label}: {url}")

    return all_ok
347
+
348
+
349
+ # ============================================================
350
+ # 主入口
351
+ # ============================================================
352
+
353
def main():
    """Command-line entry point: ingest or load documents, then answer questions.

    Flow:
        1. Parse arguments and print the banner.
        2. Unless ``--skip-api-check`` is given, report API reachability
           (informational only; the result does not stop the run).
        3. With ``--load`` open the existing vector store (exit 1 if empty);
           otherwise ingest the files given with ``-f``. If neither is
           supplied, print the help text and exit with status 1.
        4. Build the RAG chain and answer ``-q`` once, or start the REPL.
    """
    parser = argparse.ArgumentParser(
        description="PDF OCR 智能问答系统 — 端到端运行脚本",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
示例:
python run.py -f document.pdf # 交互问答
python run.py -f doc.pdf -q "主要内容?" # 单次问答
python run.py -f a.pdf b.png --clear # 批量处理
python run.py --load # 加载已有向量库
""",
    )
    # No default: machine-specific absolute paths used to be baked in here,
    # which broke the script elsewhere and made the "nothing specified"
    # branch below unreachable.
    parser.add_argument(
        "-f", "--files", nargs="+",
        default=None,
        help="要处理的文档路径 (PDF/PNG/JPG/BMP/TIF)",
    )
    parser.add_argument(
        "-q", "--question",
        help="单次问答 (不进入交互模式)",
    )
    parser.add_argument(
        "--load", action="store_true",
        help="加载已有向量库, 跳过 OCR 处理",
    )
    parser.add_argument(
        "--clear", action="store_true",
        help="清空旧向量库数据后重新处理",
    )
    parser.add_argument(
        "--chunk-size", type=int, default=config.CHUNK_SIZE,
        help=f"文本块大小 (默认: {config.CHUNK_SIZE})",
    )
    parser.add_argument(
        "--chunk-overlap", type=int, default=config.CHUNK_OVERLAP,
        help=f"块间重叠字符数 (默认: {config.CHUNK_OVERLAP})",
    )
    parser.add_argument(
        "--show-sources", action="store_true",
        help="在回答中显示参考来源",
    )
    parser.add_argument(
        "--top-k", type=int, default=config.RETRIEVAL_TOP_K,
        help=f"检索返回文档数 (默认: {config.RETRIEVAL_TOP_K})",
    )
    parser.add_argument(
        "--skip-api-check", action="store_true",
        help="跳过 API 连通性检查",
    )
    parser.add_argument(
        "--output-dir", type=str, default=None,
        help=f"中间结果保存目录 (默认: {config.OCR_OUTPUT_DIR})",
    )

    args = parser.parse_args()

    # Banner
    print_banner()

    # API reachability check (warnings only)
    if not args.skip_api_check:
        print(" 🔌 API 连通性检查:")
        check_apis()
        print()

    # Choose the mode
    if args.load:
        # Reuse the vector store that is already on disk
        print(" 📂 加载已有向量库...")
        manager = VectorStoreManager(store_type=config.VECTOR_STORE_TYPE)
        count = manager.get_document_count()
        if count == 0:
            logger.error("向量库为空! 请先用 -f 指定文件进行入库")
            sys.exit(1)
        print(f" ✅ 已加载: {count} 个文档块")
    elif args.files:
        # Ingest the given files. Path() accepts both str and Path, so the
        # configured default works whichever type config uses.
        output_dir = Path(args.output_dir or config.OCR_OUTPUT_DIR)
        manager = run_ingest(
            args.files,
            chunk_size=args.chunk_size,
            chunk_overlap=args.chunk_overlap,
            clear=args.clear,
            output_dir=output_dir,
        )
    else:
        parser.print_help()
        print("\n ❌ 请指定 -f/--files 或 --load")
        sys.exit(1)

    # Build the RAG chain
    print("\n" + "─" * 55)
    print(" 🔗 阶段 2: 初始化 RAG 问答引擎")
    print("─" * 55)
    llm = create_llm()
    chain = RAGChain(
        vector_store_manager=manager,
        llm=llm,
        top_k=args.top_k,
    )
    print(f" ✅ RAG 引擎就绪 (LLM={config.LLM_MODEL_NAME})")

    # Answer once, or enter the interactive loop
    if args.question:
        run_qa(chain, args.question, show_sources=args.show_sources)
    else:
        run_repl(chain)
462
+
463
+
464
+ if __name__ == "__main__":
465
+ main()
static/index.html ADDED
@@ -0,0 +1,637 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="zh-CN">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>OCR RAG — 智能问答系统</title>
7
+ <link rel="preconnect" href="https://fonts.googleapis.com">
8
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
9
+ <link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600&family=Outfit:wght@300;400;500;600;700&display=swap" rel="stylesheet">
10
+ <style>
11
+ :root {
12
+ --bg-root: #07090f;
13
+ --bg-surface: #0c0f17;
14
+ --bg-elevated: #111620;
15
+ --bg-overlay: #181d29;
16
+ --border-default: #1e2533;
17
+ --border-active: #2a3347;
18
+ --text-primary: #e4e7ee;
19
+ --text-secondary: #8b92a3;
20
+ --text-muted: #545b6d;
21
+ --accent-amber: #e8a840;
22
+ --accent-amber-dim: rgba(232,168,64,0.12);
23
+ --accent-amber-glow: rgba(232,168,64,0.25);
24
+ --accent-steel: #7eb8da;
25
+ --accent-steel-dim: rgba(126,184,218,0.1);
26
+ --accent-green: #4db88d;
27
+ --accent-red: #e0556a;
28
+ --radius-sm: 6px;
29
+ --radius-md: 10px;
30
+ --radius-lg: 14px;
31
+ --font-body: 'Outfit', system-ui, -apple-system, sans-serif;
32
+ --font-mono: 'JetBrains Mono', 'SF Mono', monospace;
33
+ --transition-smooth: 0.25s cubic-bezier(0.22,0.61,0.36,1);
34
+ }
35
+
36
+ *,*::before,*::after{box-sizing:border-box;margin:0;padding:0}
37
+ html,body{height:100%;background:var(--bg-root);color:var(--text-primary);font-family:var(--font-body);font-weight:400;line-height:1.6;overflow:hidden}
38
+ body::before{content:'';position:fixed;inset:0;background:radial-gradient(ellipse 60% 50% at 20% 50%, rgba(126,184,218,0.04) 0%,transparent 70%),radial-gradient(ellipse 50% 60% at 85% 40%, rgba(232,168,64,0.03) 0%,transparent 70%);pointer-events:none;z-index:0}
39
+ body::after{content:'';position:fixed;inset:0;background-image:radial-gradient(circle, rgba(255,255,255,0.025) 1px, transparent 1px);background-size:28px 28px;pointer-events:none;z-index:0}
40
+ #app{position:relative;z-index:1;display:flex;height:100vh;width:100vw}
41
+
42
+ /* ── Sidebar ── */
43
+ #sidebar{width:340px;min-width:340px;background:var(--bg-surface);border-right:1px solid var(--border-default);display:flex;flex-direction:column;overflow-y:auto;overflow-x:hidden;z-index:2}
44
+ .sidebar-brand{padding:24px 24px 20px;border-bottom:1px solid var(--border-default)}
45
+ .sidebar-brand .logo{display:flex;align-items:center;gap:10px;text-decoration:none;color:inherit}
46
+ .sidebar-brand .logo-icon{width:34px;height:34px;background:linear-gradient(135deg,var(--accent-amber),#d4952a);border-radius:var(--radius-sm);display:flex;align-items:center;justify-content:center;font-size:18px;color:#0a0d14;font-weight:700}
47
+ .sidebar-brand h1{font-family:var(--font-mono);font-size:15px;font-weight:600;letter-spacing:-0.02em;color:var(--text-primary);line-height:1.2}
48
+ .sidebar-brand .subtitle{font-size:11px;color:var(--text-muted);font-family:var(--font-mono);letter-spacing:0.04em;text-transform:uppercase}
49
+ .sidebar-section{padding:20px 24px;border-bottom:1px solid var(--border-default)}
50
+ .sidebar-section-header{display:flex;align-items:center;gap:8px;margin-bottom:14px}
51
+ .sidebar-section-header .dot{width:7px;height:7px;border-radius:50%;background:var(--accent-amber);box-shadow:0 0 6px var(--accent-amber-glow)}
52
+ .sidebar-section-header span{font-family:var(--font-mono);font-size:11px;font-weight:500;letter-spacing:0.06em;text-transform:uppercase;color:var(--text-secondary)}
53
+ .sidebar-section-header .count-badge{font-family:var(--font-mono);font-size:10px;font-weight:600;background:var(--bg-elevated);border:1px solid var(--border-active);padding:2px 8px;border-radius:100px;color:var(--text-secondary);margin-left:auto}
54
+
55
+ .upload-zone{border:2px dashed var(--border-active);border-radius:var(--radius-md);padding:24px 20px;text-align:center;cursor:pointer;transition:all var(--transition-smooth);background:var(--bg-elevated)}
56
+ .upload-zone:hover,.upload-zone.drag-over{border-color:var(--accent-amber);background:var(--accent-amber-dim)}
57
+ .upload-zone .upload-icon{font-size:28px;margin-bottom:8px;opacity:0.7}
58
+ .upload-zone .upload-text{font-size:13px;color:var(--text-secondary);font-weight:500}
59
+ .upload-zone .upload-hint{font-size:11px;color:var(--text-muted);margin-top:4px;font-family:var(--font-mono)}
60
+
61
+ .file-queue{margin-top:10px;max-height:160px;overflow-y:auto}
62
+ .file-queue-item{display:flex;align-items:center;gap:8px;padding:8px 10px;border-radius:var(--radius-sm);font-size:12px;margin-top:4px;background:var(--bg-elevated);border:1px solid var(--border-default);transition:all var(--transition-smooth)}
63
+ .file-queue-item:hover{border-color:var(--border-active)}
64
+ .file-queue-item .fq-icon{font-size:15px;flex-shrink:0}
65
+ .file-queue-item .fq-info{flex:1;min-width:0}
66
+ .file-queue-item .fq-name{font-weight:500;color:var(--text-primary);white-space:nowrap;overflow:hidden;text-overflow:ellipsis;font-size:12px}
67
+ .file-queue-item .fq-meta{font-family:var(--font-mono);font-size:10px;color:var(--text-muted)}
68
+ .file-queue-item .fq-remove{width:22px;height:22px;flex-shrink:0;border-radius:50%;border:1px solid var(--border-default);background:transparent;color:var(--text-muted);cursor:pointer;display:flex;align-items:center;justify-content:center;font-size:14px;line-height:1;transition:all var(--transition-smooth)}
69
+ .file-queue-item .fq-remove:hover{border-color:var(--accent-red);color:var(--accent-red);background:rgba(224,85,106,0.08)}
70
+
71
+ .btn{display:inline-flex;align-items:center;justify-content:center;gap:6px;border:none;border-radius:var(--radius-sm);font-family:var(--font-body);font-size:13px;font-weight:500;cursor:pointer;transition:all var(--transition-smooth);padding:10px 18px;white-space:nowrap}
72
+ .btn-primary{width:100%;background:linear-gradient(135deg,var(--accent-amber),#d4952a);color:#0a0d14;font-weight:600;font-size:14px;padding:12px 24px;letter-spacing:0.02em}
73
+ .btn-primary:hover{filter:brightness(1.1);transform:translateY(-1px)}
74
+ .btn-primary:active{transform:translateY(0)}
75
+ .btn-primary:disabled{opacity:0.4;cursor:not-allowed;filter:none;transform:none}
76
+ .btn-sm{padding:6px 14px;font-size:12px;border-radius:var(--radius-sm);width:auto}
77
+
78
+ .status-row{display:flex;align-items:center;gap:8px;padding:10px 14px;border-radius:var(--radius-sm);background:var(--bg-elevated);margin-top:8px;font-size:12px}
79
+ .status-dot{width:8px;height:8px;border-radius:50%;flex-shrink:0}
80
+ .status-dot.idle{background:var(--text-muted)}
81
+ .status-dot.processing{background:var(--accent-amber);animation:pulse 1.2s ease-in-out infinite}
82
+ .status-dot.ready{background:var(--accent-green);box-shadow:0 0 6px rgba(77,184,141,0.4)}
83
+ .status-dot.error{background:var(--accent-red)}
84
+ @keyframes pulse{0%,100%{opacity:1;box-shadow:0 0 4px var(--accent-amber-glow)}50%{opacity:0.4;box-shadow:0 0 12px var(--accent-amber-glow)}}
85
+
86
+ .process-log{margin-top:10px;background:var(--bg-root);border-radius:var(--radius-sm);padding:12px;font-family:var(--font-mono);font-size:11px;color:var(--text-muted);max-height:140px;overflow-y:auto;line-height:1.7;display:none}
87
+ .process-log.visible{display:block}
88
+ .process-log .log-entry{opacity:0;animation:logReveal 0.3s ease forwards}
89
+ .process-log .log-entry:nth-child(1){animation-delay:0.05s}
90
+ .process-log .log-entry:nth-child(2){animation-delay:0.15s}
91
+ .process-log .log-entry:nth-child(3){animation-delay:0.25s}
92
+ .process-log .log-entry:nth-child(4){animation-delay:0.35s}
93
+ .process-log .log-entry:nth-child(5){animation-delay:0.45s}
94
+ @keyframes logReveal{from{opacity:0;transform:translateX(-8px)}to{opacity:1;transform:translateX(0)}}
95
+
96
+ /* ── Sidebar file list ── */
97
+ .sb-pf-item{display:flex;align-items:center;gap:6px;padding:7px 10px;border-radius:var(--radius-sm);margin-top:3px;background:var(--bg-elevated);border:1px solid transparent;cursor:pointer;transition:all var(--transition-smooth)}
98
+ .sb-pf-item:hover{border-color:var(--border-active)}
99
+ .sb-pf-item.selected{border-color:var(--accent-amber);background:var(--accent-amber-dim)}
100
+ .sb-pf-item .sb-icon{font-size:13px;flex-shrink:0;opacity:0.7}
101
+ .sb-pf-item .sb-name{font-size:11px;font-weight:500;color:var(--text-primary);white-space:nowrap;overflow:hidden;text-overflow:ellipsis;flex:1}
102
+ .sb-pf-item .sb-meta{font-family:var(--font-mono);font-size:9px;color:var(--text-muted);white-space:nowrap}
103
+ .sb-pf-item .sb-del{width:18px;height:18px;border-radius:50%;border:none;background:transparent;color:var(--text-muted);cursor:pointer;font-size:11px;display:none;align-items:center;justify-content:center;flex-shrink:0;transition:all var(--transition-smooth)}
104
+ .sb-pf-item:hover .sb-del{display:flex}
105
+ .sb-pf-item .sb-del:hover{background:rgba(224,85,106,0.12);color:var(--accent-red)}
106
+ .sb-pf-empty{font-size:11px;color:var(--text-muted);text-align:center;padding:16px 0}
107
+
108
+ /* ── Main ── */
109
+ #main{flex:1;display:flex;flex-direction:column;min-width:0;background:var(--bg-root)}
110
+ .tab-nav{display:flex;gap:0;border-bottom:1px solid var(--border-default);padding:0 20px;background:var(--bg-surface)}
111
+ .tab-btn{padding:14px 22px;background:none;border:none;color:var(--text-muted);font-family:var(--font-mono);font-size:12px;font-weight:500;cursor:pointer;letter-spacing:0.04em;position:relative;transition:color var(--transition-smooth)}
112
+ .tab-btn:hover{color:var(--text-secondary)}
113
+ .tab-btn.active{color:var(--accent-amber)}
114
+ .tab-btn.active::after{content:'';position:absolute;bottom:-1px;left:0;right:0;height:2px;background:var(--accent-amber);box-shadow:0 0 8px var(--accent-amber-glow)}
115
+ .tab-btn .tab-badge{font-size:10px;background:var(--accent-amber-dim);color:var(--accent-amber);padding:1px 7px;border-radius:100px;margin-left:6px}
116
+ .tab-panel{display:none;flex:1;overflow:hidden}
117
+ .tab-panel.active{display:flex;flex-direction:column}
118
+
119
+ /* ── Chat ── */
120
+ #chat-panel{display:none;flex:1;flex-direction:column;overflow:hidden}
121
+ #chat-panel.active{display:flex}
122
+ .chat-messages{flex:1;overflow-y:auto;padding:24px 28px;display:flex;flex-direction:column;gap:18px}
123
+ .chat-empty{flex:1;display:flex;flex-direction:column;align-items:center;justify-content:center;color:var(--text-muted);text-align:center;gap:12px}
124
+ .chat-empty .empty-icon{font-size:48px;opacity:0.3}
125
+ .chat-empty .empty-title{font-size:18px;font-weight:500;color:var(--text-secondary)}
126
+ .chat-empty .empty-desc{font-size:13px;max-width:400px;line-height:1.6}
127
+ .quick-prompts{display:flex;flex-wrap:wrap;gap:8px;justify-content:center;margin-top:8px}
128
+ .quick-prompt{padding:6px 14px;border-radius:100px;font-size:12px;font-weight:500;cursor:pointer;border:1px solid var(--border-active);background:var(--bg-elevated);color:var(--text-secondary);transition:all var(--transition-smooth);white-space:nowrap}
129
+ .quick-prompt:hover{border-color:var(--accent-amber);color:var(--accent-amber);background:var(--accent-amber-dim)}
130
+ .message{display:flex;gap:12px;animation:msgIn 0.35s cubic-bezier(0.22,0.61,0.36,1)}
131
+ @keyframes msgIn{from{opacity:0;transform:translateY(12px)}to{opacity:1;transform:translateY(0)}}
132
+ .message .msg-avatar{width:32px;height:32px;border-radius:var(--radius-sm);display:flex;align-items:center;justify-content:center;font-size:15px;flex-shrink:0;font-weight:600}
133
+ .message.user .msg-avatar{background:var(--accent-steel-dim);color:var(--accent-steel);font-family:var(--font-mono);font-size:13px}
134
+ .message.assistant .msg-avatar{background:var(--accent-amber-dim);color:var(--accent-amber)}
135
+ .message .msg-bubble{max-width:75%;padding:12px 16px;border-radius:var(--radius-md);font-size:14px;line-height:1.65}
136
+ .message.user .msg-bubble{background:var(--bg-overlay);color:var(--text-primary);border:1px solid var(--border-default)}
137
+ .message.assistant .msg-bubble{background:var(--bg-elevated);color:var(--text-primary);border:1px solid var(--border-active)}
138
+ .chat-input-area{padding:16px 24px 20px;border-top:1px solid var(--border-default);background:var(--bg-surface)}
139
+ .chat-input-row{display:flex;gap:10px;align-items:flex-end}
140
+ .chat-input-row textarea{flex:1;background:var(--bg-root);border:1px solid var(--border-active);border-radius:var(--radius-md);color:var(--text-primary);font-family:var(--font-body);font-size:14px;padding:12px 16px;resize:none;outline:none;min-height:46px;max-height:120px;line-height:1.5;transition:border-color var(--transition-smooth)}
141
+ .chat-input-row textarea:focus{border-color:var(--accent-amber);box-shadow:0 0 0 3px var(--accent-amber-dim)}
142
+ .chat-input-row textarea::placeholder{color:var(--text-muted)}
143
+ .chat-input-row .btn-send{width:46px;height:46px;border-radius:var(--radius-md);background:var(--accent-amber);border:none;color:#0a0d14;font-size:18px;cursor:pointer;transition:all var(--transition-smooth);display:flex;align-items:center;justify-content:center;flex-shrink:0}
144
+ .chat-input-row .btn-send:hover{filter:brightness(1.1)}
145
+ .chat-input-row .btn-send:disabled{opacity:0.3;cursor:not-allowed;filter:none}
146
+
147
+ /* ── Documents tab (merged Preview + Documents) ── */
148
+ #preview-panel{display:flex;flex-direction:column;overflow:hidden;flex:1}
149
+
150
+ /* File cards row */
151
+ .doc-cards-wrap{padding:16px 24px;border-bottom:1px solid var(--border-default);overflow-x:auto;flex-shrink:0}
152
+ .doc-cards-wrap h3{font-family:var(--font-mono);font-size:11px;font-weight:500;letter-spacing:0.06em;color:var(--text-muted);text-transform:uppercase;margin-bottom:12px;display:flex;align-items:center;gap:8px}
153
+ .doc-cards-wrap h3::before{content:'';width:7px;height:7px;border-radius:50%;background:var(--accent-amber);box-shadow:0 0 6px var(--accent-amber-glow)}
154
+ .doc-cards{display:flex;gap:12px;padding-bottom:4px}
155
+ .doc-card{flex:0 0 auto;width:220px;background:var(--bg-elevated);border:2px solid var(--border-default);border-radius:var(--radius-md);padding:16px;cursor:pointer;transition:all var(--transition-smooth);position:relative}
156
+ .doc-card:hover{border-color:var(--border-active);background:var(--bg-overlay)}
157
+ .doc-card.selected{border-color:var(--accent-amber);box-shadow:0 0 12px var(--accent-amber-dim);background:var(--bg-overlay)}
158
+ .doc-card .dc-icon{width:38px;height:38px;border-radius:var(--radius-sm);display:flex;align-items:center;justify-content:center;font-family:var(--font-mono);font-size:12px;font-weight:700;margin-bottom:10px}
159
+ .doc-card .dc-icon.pdf{background:rgba(224,85,106,0.12);color:var(--accent-red)}
160
+ .doc-card .dc-icon.img{background:var(--accent-steel-dim);color:var(--accent-steel)}
161
+ .doc-card .dc-name{font-weight:600;font-size:13px;color:var(--text-primary);white-space:nowrap;overflow:hidden;text-overflow:ellipsis;margin-bottom:8px;line-height:1.3}
162
+ .doc-card .dc-stats{display:flex;gap:10px;font-family:var(--font-mono);font-size:10px;color:var(--text-muted)}
163
+ .doc-card .dc-stats span{display:flex;align-items:center;gap:2px}
164
+ .doc-card .dc-stats .dc-val{color:var(--text-secondary)}
165
+ .doc-card .dc-delete{position:absolute;top:8px;right:8px;width:22px;height:22px;border-radius:50%;border:1px solid transparent;background:transparent;color:var(--text-muted);cursor:pointer;display:flex;align-items:center;justify-content:center;font-size:12px;transition:all var(--transition-smooth);opacity:0}
166
+ .doc-card:hover .dc-delete{opacity:1;border-color:var(--border-default)}
167
+ .doc-card .dc-delete:hover{border-color:var(--accent-red);color:var(--accent-red);background:rgba(224,85,106,0.08)}
168
+
169
+ /* Preview pane */
170
+ .doc-preview-wrap{flex:1;display:flex;flex-direction:column;overflow:hidden}
171
+ .doc-preview-header{display:flex;align-items:center;gap:12px;padding:12px 24px;background:var(--bg-surface);border-bottom:1px solid var(--border-default);flex-shrink:0}
172
+ .doc-preview-header .dph-title{font-weight:600;font-size:14px;color:var(--text-primary)}
173
+ .doc-preview-header .dph-meta{font-family:var(--font-mono);font-size:11px;color:var(--text-muted);margin-left:auto}
174
+ .doc-preview-body{flex:1;overflow-y:auto;padding:24px 28px;font-family:var(--font-mono);font-size:13px;line-height:1.8;color:var(--text-secondary);white-space:pre-wrap;word-break:break-word}
175
+ .doc-preview-empty{flex:1;display:flex;align-items:center;justify-content:center;text-align:center;color:var(--text-muted)}
176
+ .doc-preview-empty .dpe-icon{font-size:48px;opacity:0.15;margin-bottom:10px}
177
+ .doc-preview-empty .dpe-text{font-size:14px}
178
+ .pv-page-header{color:var(--accent-amber);font-weight:600;font-size:12px;letter-spacing:0.04em;padding:8px 0;border-bottom:1px solid var(--border-default);margin:16px 0 12px}
179
+ .pv-page-header:first-child{margin-top:0}
180
+
181
+ .doc-empty-state{display:flex;flex-direction:column;align-items:center;justify-content:center;height:100%;text-align:center;color:var(--text-muted)}
182
+ .doc-empty-state .de-icon{font-size:56px;opacity:0.12;margin-bottom:12px}
183
+ .doc-empty-state .de-title{font-size:16px;color:var(--text-secondary);font-weight:500}
184
+ .doc-empty-state .de-desc{font-size:13px;margin-top:6px;max-width:360px}
185
+
186
+ /* ── Split preview ── */
187
+ .pv-split-wrap{flex:1;display:flex;flex-direction:column;overflow:hidden}
188
+ .pv-split-header{display:flex;align-items:center;gap:12px;padding:10px 24px;background:var(--bg-surface);border-bottom:1px solid var(--border-default);flex-shrink:0}
189
+ .pv-split-body{flex:1;display:flex;overflow:hidden}
190
+ .pv-left{flex:1;overflow:auto;background:var(--bg-root);display:flex;align-items:center;justify-content:center;min-width:0}
191
+ .pv-left iframe{width:100%;height:100%;border:none;background:#fff}
192
+ .pv-left img{max-width:100%;max-height:100%;object-fit:contain}
193
+ .pv-left-placeholder{color:var(--text-muted);font-size:13px;text-align:center}
194
+ .pv-divider{width:4px;background:var(--border-default);flex-shrink:0;cursor:col-resize;transition:background var(--transition-smooth)}
195
+ .pv-divider:hover{background:var(--accent-amber)}
196
+ .pv-right{flex:1;overflow-y:auto;padding:20px 24px;font-family:var(--font-mono);font-size:13px;line-height:1.8;color:var(--text-secondary);white-space:pre-wrap;word-break:break-word;min-width:0}
197
+ .pv-right-placeholder{text-align:center;color:var(--text-muted);font-size:13px;padding-top:60px}
198
+
199
+ /* ── Status ── */
200
+ #status-panel{padding:24px 28px;overflow-y:auto;flex:1}
201
+ .status-card{background:var(--bg-elevated);border:1px solid var(--border-default);border-radius:var(--radius-md);padding:20px;margin-bottom:16px}
202
+ .status-card h3{font-family:var(--font-mono);font-size:12px;font-weight:500;letter-spacing:0.06em;color:var(--text-muted);text-transform:uppercase;margin-bottom:14px;display:flex;align-items:center;gap:8px}
203
+ .status-card h3::before{content:'';width:8px;height:8px;border-radius:2px;background:var(--accent-steel)}
204
+ .status-card h3 .sc-btn{margin-left:auto;cursor:pointer}
205
+ .status-card h3 .sc-btn::before{content:none}
206
+ .model-stack{display:grid;grid-template-columns:1fr 1fr;gap:10px}
207
+ .model-item{background:var(--bg-surface);border-radius:var(--radius-sm);padding:12px 14px;border:1px solid var(--border-default)}
208
+ .model-item .model-label{font-size:10px;font-family:var(--font-mono);color:var(--text-muted);text-transform:uppercase;letter-spacing:0.04em;margin-bottom:4px}
209
+ .model-item .model-value{font-size:13px;font-weight:600;color:var(--text-primary)}
210
+ .model-item .model-sub{font-family:var(--font-mono);font-size:10px;color:var(--text-muted);margin-top:2px;word-break:break-all}
211
+ .param-row{display:flex;justify-content:space-between;align-items:center;padding:8px 0;border-bottom:1px solid rgba(255,255,255,0.03);font-size:13px}
212
+ .param-row:last-child{border-bottom:none}
213
+ .param-row .param-label{color:var(--text-secondary)}
214
+ .param-row .param-value{font-family:var(--font-mono);font-size:12px;color:var(--accent-steel);font-weight:500}
215
+ .param-edit{display:flex;align-items:center;gap:8px}
216
+ .param-edit input[type="range"]{-webkit-appearance:none;width:120px;height:4px;border-radius:2px;background:var(--border-active);outline:none}
217
+ .param-edit input[type="range"]::-webkit-slider-thumb{-webkit-appearance:none;width:14px;height:14px;border-radius:50%;background:var(--accent-amber);cursor:pointer;border:2px solid var(--bg-root)}
218
+ .param-edit .pe-val{font-family:var(--font-mono);font-size:12px;color:var(--accent-amber);min-width:45px;text-align:right}
219
+ .param-edit input[type="number"]{width:70px;padding:6px 8px;background:var(--bg-root);border:1px solid var(--border-default);border-radius:var(--radius-sm);color:var(--text-primary);font-family:var(--font-mono);font-size:12px;outline:none;text-align:center}
220
+ .param-edit input[type="number"]:focus{border-color:var(--accent-amber)}
221
+ .config-section{margin-bottom:16px;padding-bottom:12px;border-bottom:1px solid var(--border-default)}
222
+ .config-section:last-child{border-bottom:none;margin-bottom:0}
223
+ .config-section h4{font-size:13px;font-weight:600;color:var(--text-secondary);margin-bottom:8px}
224
+ .form-group{margin-bottom:8px}
225
+ .form-group label{display:block;font-size:11px;font-weight:500;color:var(--text-muted);margin-bottom:4px;text-transform:uppercase;letter-spacing:0.05em}
226
+ .form-group input,.form-group select{width:100%;padding:8px 10px;background:var(--bg-root);border:1px solid var(--border-default);border-radius:var(--radius-sm);color:var(--text-primary);font-family:var(--font-mono);font-size:12px;outline:none;transition:var(--transition-smooth)}
227
+ .form-group input:focus,.form-group select:focus{border-color:var(--accent-amber);box-shadow:0 0 0 2px var(--accent-amber-dim)}
228
+ .form-group .input-row{display:flex;gap:6px}
229
+ .form-group .input-row input{flex:1}
230
+ .form-group .toggle-vis{width:34px;flex-shrink:0;background:var(--bg-elevated);border:1px solid var(--border-default);border-radius:var(--radius-sm);color:var(--text-muted);cursor:pointer;font-family:var(--font-mono);font-size:11px;display:flex;align-items:center;justify-content:center;transition:var(--transition-smooth)}
231
+ .form-group .toggle-vis:hover{border-color:var(--border-active);color:var(--text-secondary)}
232
+ .config-readonly{font-family:var(--font-mono);font-size:11px;color:var(--text-muted);padding:4px 0}
233
+ .sources-toggle{padding:8px 24px;font-size:11px;font-family:var(--font-mono);color:var(--text-muted);cursor:pointer;user-select:none;border-top:1px solid var(--border-default);background:var(--bg-surface);display:flex;align-items:center;gap:6px;transition:color var(--transition-smooth)}
234
+ .sources-toggle:hover{color:var(--text-secondary)}
235
+ .sources-content{background:var(--bg-elevated);border-top:1px solid var(--border-default);max-height:200px;overflow-y:auto;display:none;padding:12px 24px}
236
+ .sources-content.open{display:block}
237
+ .source-item{padding:8px 12px;border-left:2px solid var(--accent-steel);margin-bottom:8px;font-size:12px;background:var(--bg-surface);border-radius:0 var(--radius-sm) var(--radius-sm) 0}
238
+ .source-item .src-header{display:flex;gap:12px;font-family:var(--font-mono);font-size:10px;color:var(--accent-amber);margin-bottom:4px}
239
+ .source-item .src-excerpt{color:var(--text-secondary);font-size:12px;line-height:1.5}
240
+ .typing-indicator{display:flex;gap:5px;padding:4px 0}
241
+ .typing-indicator span{width:6px;height:6px;border-radius:50%;background:var(--text-muted);animation:typingBounce 1.2s ease-in-out infinite}
242
+ .typing-indicator span:nth-child(2){animation-delay:0.15s}
243
+ .typing-indicator span:nth-child(3){animation-delay:0.3s}
244
+ @keyframes typingBounce{0%,60%,100%{transform:translateY(0);opacity:0.4}30%{transform:translateY(-6px);opacity:1}}
245
+ ::-webkit-scrollbar{width:5px}
246
+ ::-webkit-scrollbar-track{background:transparent}
247
+ ::-webkit-scrollbar-thumb{background:var(--border-active);border-radius:3px}
248
+ ::-webkit-scrollbar-thumb:hover{background:var(--text-muted)}
249
+ .toast{position:fixed;bottom:24px;right:24px;background:var(--bg-overlay);border:1px solid var(--border-active);border-radius:var(--radius-md);padding:14px 20px;font-size:13px;z-index:100;opacity:0;transform:translateY(12px);transition:all 0.3s ease;pointer-events:none;max-width:360px}
250
+ .toast.show{opacity:1;transform:translateY(0)}
251
+ .toast.error{border-color:var(--accent-red)}
252
+ .toast.success{border-color:var(--accent-green)}
253
+ .progress-bar-wrap{height:3px;background:var(--border-default);border-radius:2px;margin-top:10px;overflow:hidden;display:none}
254
+ .progress-bar-wrap.active{display:block}
255
+ .progress-bar-fill{height:100%;background:linear-gradient(90deg,var(--accent-amber),#d4952a);border-radius:2px;width:0%;transition:width 0.3s ease}
256
+ @media(max-width:860px){
257
+ #app{flex-direction:column}
258
+ #sidebar{width:100%;min-width:100%;max-height:40vh;border-right:none;border-bottom:1px solid var(--border-default)}
259
+ .doc-card{width:180px}
260
+ .model-stack{grid-template-columns:1fr}
261
+ }
262
+ </style>
263
+ </head>
264
+ <body>
265
+
266
+ <div id="app">
267
+ <!-- ═══ SIDEBAR ═══ -->
268
+ <aside id="sidebar">
269
+ <div class="sidebar-brand">
270
+ <a class="logo" href="/"><div class="logo-icon">◈</div><div><h1>OCR RAG</h1><div class="subtitle">Intelligent Q&A System</div></div></a>
271
+ </div>
272
+ <div class="sidebar-section">
273
+ <div class="sidebar-section-header"><div class="dot"></div><span>Document Upload</span><span class="count-badge" id="queueCount">0</span></div>
274
+ <div class="upload-zone" id="uploadZone">
275
+ <div class="upload-icon">↓</div>
276
+ <div class="upload-text">Drop files here or click to browse</div>
277
+ <div class="upload-hint">PDF · PNG · JPG · BMP · TIF</div>
278
+ </div>
279
+ <input type="file" id="fileInput" accept=".pdf,.png,.jpg,.jpeg,.bmp,.tif,.tiff" multiple hidden>
280
+ <div class="file-queue" id="fileQueue"></div>
281
+ <button class="btn btn-primary" id="processBtn" disabled>→ Process Documents</button>
282
+ <div class="progress-bar-wrap" id="progressWrap"><div class="progress-bar-fill" id="progressFill"></div></div>
283
+ <div class="status-row">
284
+ <div class="status-dot idle" id="statusDot"></div>
285
+ <span id="statusText">Ready — upload files to begin</span>
286
+ </div>
287
+ <div class="process-log" id="processLog"></div>
288
+ </div>
289
+ <div class="sidebar-section" style="flex:1;overflow-y:auto" id="sidebarFileSection">
290
+ <div class="sidebar-section-header"><div class="dot"></div><span>Processed Files</span><span class="count-badge" id="sidebarPfCount">0</span></div>
291
+ <div id="sidebarFileList"><div class="sb-pf-empty">No files processed yet</div></div>
292
+ </div>
293
+ </aside>
294
+
295
+ <!-- ═══ MAIN ═══ -->
296
+ <main id="main">
297
+ <nav class="tab-nav">
298
+ <button class="tab-btn active" data-tab="chat">Chat</button>
299
+ <button class="tab-btn" data-tab="preview">Preview <span class="tab-badge" id="docTabBadge">0</span></button>
300
+ <button class="tab-btn" data-tab="status">System Status</button>
301
+ </nav>
302
+
303
+ <!-- Chat -->
304
+ <div class="tab-panel active" id="tab-chat">
305
+ <div id="chat-panel" class="active">
306
+ <div class="chat-messages" id="chatMessages">
307
+ <div class="chat-empty" id="chatEmpty">
308
+ <div class="empty-icon">◈</div>
309
+ <div class="empty-title">Ask questions about your documents</div>
310
+ <div class="empty-desc">Upload and process documents first, then ask questions. The AI will search through the documents to find relevant answers.</div>
311
+ <div class="quick-prompts">
312
+ <span class="quick-prompt" data-question="请对这份文档进行详细摘要,列出各章节的主要内容">Summary</span>
313
+ <span class="quick-prompt" data-question="文档中提到了哪些关键数据和重要信息?请分点列出">Key Data</span>
314
+ <span class="quick-prompt" data-question="文档中的表格包含了什么内容?请整理说明">Tables</span>
315
+ <span class="quick-prompt" data-question="文档的核心观点和结论是什么?">Core Ideas</span>
316
+ </div>
317
+ </div>
318
+ </div>
319
+ <div class="sources-toggle" id="sourcesToggle" style="display:none">↓ Sources &amp; References</div>
320
+ <div class="sources-content" id="sourcesContent"></div>
321
+ <div class="chat-input-area">
322
+ <div class="chat-input-row">
323
+ <textarea id="questionInput" placeholder="Ask a question about the documents..." rows="1"></textarea>
324
+ <button class="btn-send" id="sendBtn" disabled>↑</button>
325
+ </div>
326
+ </div>
327
+ </div>
328
+ </div>
329
+
330
+ <!-- Preview -->
331
+ <div class="tab-panel" id="tab-preview">
332
+ <div id="preview-panel">
333
+ <!-- Top: file card row -->
334
+ <div class="doc-cards-wrap" id="docCardsWrap">
335
+ <h3>Processed Files</h3>
336
+ <div class="doc-cards" id="docCards"></div>
337
+ </div>
338
+ <div class="doc-empty-state" id="docEmptyState">
339
+ <div class="de-icon">◈</div>
340
+ <div class="de-title">No documents processed yet</div>
341
+ <div class="de-desc">Upload and process files from the sidebar — they will appear here. Click on a file card to preview the original file and OCR result side by side.</div>
342
+ </div>
343
+ <!-- Split preview pane -->
344
+ <div class="pv-split-wrap" id="pvSplitWrap" style="display:none">
345
+ <div class="pv-split-header">
346
+ <span class="dph-title" id="dphTitle">—</span>
347
+ <span class="dph-meta" id="dphMeta"></span>
348
+ </div>
349
+ <div class="pv-split-body">
350
+ <div class="pv-left" id="pvOriginal">
351
+ <div class="pv-left-placeholder">Select a file to preview</div>
352
+ </div>
353
+ <div class="pv-divider"></div>
354
+ <div class="pv-right" id="pvOcrText">
355
+ <div class="pv-right-placeholder">OCR result will appear here</div>
356
+ </div>
357
+ </div>
358
+ </div>
359
+ </div>
360
+ </div>
361
+
362
+ <!-- Status -->
363
+ <div class="tab-panel" id="tab-status">
364
+ <div id="status-panel">
365
+ <div class="status-card"><h3>Model Stack</h3>
366
+ <div class="model-stack">
367
+ <div class="model-item"><div class="model-label">OCR Engine</div><div class="model-value" id="sOCRModel">—</div><div class="model-sub" id="sOCRBase">—</div></div>
368
+ <div class="model-item"><div class="model-label">Embedding</div><div class="model-value" id="sEmbedModel">—</div><div class="model-sub" id="sEmbedBase">—</div></div>
369
+ <div class="model-item"><div class="model-label">LLM</div><div class="model-value" id="sLLMModel">—</div><div class="model-sub" id="sLLMBase">—</div></div>
370
+ <div class="model-item"><div class="model-label">Vector DB</div><div class="model-value" id="sVectorDB">—</div></div>
371
+ </div>
372
+ </div>
373
+ <div class="status-card"><h3>Processing Parameters</h3>
374
+ <div class="param-row">
375
+ <span class="param-label">Chunk Size</span>
376
+ <div class="param-edit"><input type="range" id="sChunkSize" min="200" max="2000" value="800" step="50"><span class="pe-val" id="sChunkSizeVal">800</span><span style="font-size:10px;color:var(--text-muted)">chars</span></div>
377
+ </div>
378
+ <div class="param-row">
379
+ <span class="param-label">Overlap</span>
380
+ <div class="param-edit"><input type="range" id="sChunkOverlap" min="0" max="500" value="150" step="25"><span class="pe-val" id="sChunkOverlapVal">150</span><span style="font-size:10px;color:var(--text-muted)">chars</span></div>
381
+ </div>
382
+ <div class="param-row" style="margin-top:10px">
383
+ <span class="param-label">Retrieval Top-K</span>
384
+ <div class="param-edit"><input type="number" id="sRetrievalK" min="1" max="20" value="5"></div>
385
+ </div>
386
+ <button class="btn btn-primary btn-sm" onclick="saveProcessingParams()" style="margin-top:14px">Save Parameters</button>
387
+ <span id="paramsMsg" style="font-size:12px;margin-left:10px"></span>
388
+ </div>
389
+ <div class="status-card" id="configCard">
390
+ <h3><span>API Configuration</span><button class="btn btn-primary btn-sm sc-btn" id="editConfigBtn">Edit</button></h3>
391
+ <div id="configDisplay"><div id="configContent"></div></div>
392
+ <div id="configEdit" style="display:none">
393
+ <div class="config-section"><h4>OCR API</h4>
394
+ <div class="form-group"><label>Engine</label><select id="cfgOcrEngine"><option value="paddle">paddle (local)</option><option value="api">api (remote)</option></select></div>
395
+ <div class="form-group"><label>API Base URL</label><input id="cfgOcrBase" placeholder="http://127.0.0.1:8002/v1"></div>
396
+ <div class="form-group"><label>API Key</label><div class="input-row"><input id="cfgOcrKey" type="password" placeholder="not-needed"><button class="toggle-vis" onclick="togglePassword('cfgOcrKey',this)">👁</button></div></div>
397
+ <div class="form-group"><label>Model Name</label><input id="cfgOcrModel" placeholder="PaddleOCR-VL-1.5"></div>
398
+ </div>
399
+ <div class="config-section"><h4>Embedding API</h4>
400
+ <div class="form-group"><label>API Base URL</label><input id="cfgEmbedBase" placeholder="https://dashscope.aliyuncs.com/compatible-mode/v1"></div>
401
+ <div class="form-group"><label>API Key</label><div class="input-row"><input id="cfgEmbedKey" type="password" placeholder="sk-..."><button class="toggle-vis" onclick="togglePassword('cfgEmbedKey',this)">👁</button></div></div>
402
+ <div class="form-group"><label>Model Name</label><input id="cfgEmbedModel" placeholder="text-embedding-v4"></div>
403
+ </div>
404
+ <div class="config-section"><h4>LLM API</h4>
405
+ <div class="form-group"><label>API Base URL</label><input id="cfgLLMBase" placeholder="http://0.0.0.0:8013/v1"></div>
406
+ <div class="form-group"><label>API Key</label><div class="input-row"><input id="cfgLLMKey" type="password" placeholder="not-needed"><button class="toggle-vis" onclick="togglePassword('cfgLLMKey',this)">👁</button></div></div>
407
+ <div class="form-group"><label>Model Name</label><input id="cfgLLMModel" placeholder="Qwen/Qwen3-4B-Instruct-2507"></div>
408
+ </div>
409
+ <div style="display:flex;gap:8px;margin-top:12px"><button class="btn btn-primary btn-sm" onclick="saveConfig()">Save</button><button class="btn btn-sm" onclick="cancelConfigEdit()">Cancel</button></div>
410
+ <div id="configMsg" style="margin-top:8px;font-size:13px"></div>
411
+ </div>
412
+ </div>
413
+ <div class="status-card"><h3>Database</h3><div id="sDBStats">No documents indexed</div></div>
414
+ </div>
415
+ </div>
416
+ </main>
417
+ </div>
418
+
419
+ <div class="toast" id="toast"></div>
420
+
421
+ <script>
422
// Shorthand DOM query helpers used throughout the script.
const $ = (selector) => document.querySelector(selector);
const $$ = (selector) => document.querySelectorAll(selector);

// Mutable UI state: files queued for upload, whether chat is enabled,
// whether an upload is in flight, the processed-file list, and the index
// of the file currently shown in the preview pane (-1 = none).
const state = {
  files: [],
  ready: false,
  processing: false,
  fileList: [],
  selectedDoc: -1,
};

// References to the elements the script touches repeatedly, looked up once.
const D = {
  uploadZone: $('#uploadZone'),
  fileInput: $('#fileInput'),
  fileQueue: $('#fileQueue'),
  queueCount: $('#queueCount'),
  processBtn: $('#processBtn'),
  progressWrap: $('#progressWrap'),
  progressFill: $('#progressFill'),
  statusDot: $('#statusDot'),
  statusText: $('#statusText'),
  processLog: $('#processLog'),
  chatMessages: $('#chatMessages'),
  chatEmpty: $('#chatEmpty'),
  questionInput: $('#questionInput'),
  sendBtn: $('#sendBtn'),
  sToggle: $('#sourcesToggle'),
  sContent: $('#sourcesContent'),
  docCards: $('#docCards'),
  docCardsWrap: $('#docCardsWrap'),
  docEmptyState: $('#docEmptyState'),
  pvSplitWrap: $('#pvSplitWrap'),
  pvOriginal: $('#pvOriginal'),
  pvOcrText: $('#pvOcrText'),
  dphTitle: $('#dphTitle'),
  dphMeta: $('#dphMeta'),
  docTabBadge: $('#docTabBadge'),
  toast: $('#toast'),
};
439
+
440
+ // ─── Toast ──────────────────────────────────────────────
441
// Handle of the pending auto-hide timer, so a new toast cancels the old one.
let tt;

/**
 * Show a transient toast message and hide it again after 3.5 s.
 * @param {string} m Text to display.
 * @param {string} t Class appended after "toast" (the stylesheet styles
 *                   "error" and "success").
 */
function showToast(m, t) {
  clearTimeout(tt);
  const el = D.toast;
  el.textContent = m;
  el.className = 'toast ' + t + ' show';
  tt = setTimeout(() => {
    el.classList.remove('show');
  }, 3500);
}
442
+
443
+ // ─── Log / Progress ─────────────────────────────────────
444
/**
 * Append one line to the processing log, reveal the log, and scroll it to
 * the newest entry.
 * @param {string} m Message text; rendered as "> m".
 */
function addLog(m) {
  const log = D.processLog;
  log.classList.add('visible');
  const entry = document.createElement('div');
  entry.className = 'log-entry';
  entry.textContent = '> ' + m;
  log.appendChild(entry);
  log.scrollTop = log.scrollHeight;
}
445
/** Remove every entry from the processing log and hide it. */
function clearLog() {
  const log = D.processLog;
  log.innerHTML = '';
  log.classList.remove('visible');
}
446
/**
 * Set the width of the progress bar fill.
 * @param {number} p Percentage written to the fill's CSS width.
 */
function setProgress(p) {
  D.progressFill.style.width = p + '%';
}
447
/**
 * Show or hide the progress bar by toggling its "active" class.
 * @param {boolean} s Force flag passed to classList.toggle.
 */
function showProgress(s) {
  D.progressWrap.classList.toggle('active', s);
}
448
/**
 * Update the sidebar status indicator.
 * @param {string} st Modifier class for the dot (this script uses "idle",
 *                    "processing", "ready" and "error").
 * @param {string} t  Status text shown next to the dot.
 */
function setStatus(st, t) {
  D.statusDot.className = 'status-dot ' + st;
  D.statusText.textContent = t;
}
449
+
450
+ // ─── File queue ─────────────────────────────────────────
451
// File extensions accepted for upload (lower-case, with leading dot).
const VE=['.pdf','.png','.jpg','.jpeg','.bmp','.tif','.tiff'];

/**
 * Re-render the upload queue from state.files and sync the process button,
 * the queue counter and the status line with it.
 *
 * File names come from the user's file system, so they are escaped before
 * being placed into innerHTML — both as element text and inside the
 * title="…" attribute, where double quotes must be encoded as well.
 */
function renderQueue(){
  const count=state.files.length;
  if(!count){
    D.fileQueue.innerHTML='';
    D.processBtn.disabled=true;
    D.processBtn.textContent='→ Process Documents';
    D.queueCount.textContent='0';
    setStatus('idle','Ready — upload files to begin');
    return;
  }
  // Safe for text and double-quoted attribute contexts.
  const safe=v=>esc(v).replace(/"/g,'&quot;');
  D.fileQueue.innerHTML=state.files.map((f,i)=>{
    const name=safe(f.name);
    const icon=f.name.toLowerCase().endsWith('.pdf')?'📄':'🖼️';
    const sizeMb=(f.size/1024/1024).toFixed(1);
    return `<div class="file-queue-item"><span class="fq-icon">${icon}</span><div class="fq-info"><div class="fq-name" title="${name}">${name}</div><div class="fq-meta">${sizeMb} MB</div></div><button class="fq-remove" onclick="removeFile(${i})" title="Remove">×</button></div>`;
  }).join('');
  const plural=count>1?'s':'';
  D.processBtn.disabled=false;
  D.processBtn.textContent=`→ Process ${count} File${plural}`;
  D.queueCount.textContent=count;
  setStatus('idle',`${count} file${plural} queued`);
}
458
/**
 * Add files to the upload queue and re-render it.
 * Files whose extension is not in VE are skipped with an error toast;
 * files already queued (same name and size) are skipped silently.
 * @param {Iterable<File>} a Files from a drop event or the file input.
 */
function addFiles(a) {
  for (const file of a) {
    const ext = '.' + file.name.split('.').pop().toLowerCase();
    if (!VE.includes(ext)) {
      showToast('Skipped: ' + file.name, 'error');
      continue;
    }
    const alreadyQueued = state.files.some(
      (queued) => queued.name === file.name && queued.size === file.size
    );
    if (!alreadyQueued) {
      state.files.push(file);
    }
  }
  renderQueue();
}
459
/**
 * Remove one file from the upload queue and re-render it.
 * @param {number} i Index into state.files.
 */
function removeFile(i) {
  state.files.splice(i, 1);
  renderQueue();
}
460
+
461
+ // ─── Upload events ──────────────────────────────────────
462
// Clicking the drop zone opens the hidden file picker.
D.uploadZone.addEventListener('click', () => {
  D.fileInput.click();
});

// preventDefault on dragover is what allows the drop event to fire.
D.uploadZone.addEventListener('dragover', (ev) => {
  ev.preventDefault();
  D.uploadZone.classList.add('drag-over');
});

D.uploadZone.addEventListener('dragleave', () => {
  D.uploadZone.classList.remove('drag-over');
});

D.uploadZone.addEventListener('drop', (ev) => {
  ev.preventDefault();
  D.uploadZone.classList.remove('drag-over');
  const dropped = ev.dataTransfer.files;
  if (dropped.length) {
    addFiles(dropped);
  }
});

// Reset the input after reading it so picking the same file again still
// triggers a change event.
D.fileInput.addEventListener('change', () => {
  if (!D.fileInput.files.length) return;
  addFiles(D.fileInput.files);
  D.fileInput.value = '';
});
467
+
468
+ // ─── Process ────────────────────────────────────────────
469
/**
 * Upload the queued files to /api/upload together with the current chunking
 * parameters, then report the per-file results in the log and status line.
 *
 * Fixes over the previous version:
 *  - the queue is cleared *before* the final status is set, because
 *    renderQueue() resets the status line to "idle";
 *  - chat is only enabled when at least one file was actually processed;
 *  - an error response whose body is not JSON no longer surfaces as a
 *    JSON parse error.
 */
D.processBtn.addEventListener('click',async()=>{
  if(!state.files.length||state.processing)return;
  state.processing=true;
  D.processBtn.disabled=true;
  D.processBtn.textContent='Processing...';
  clearLog();showProgress(true);setProgress(5);
  setStatus('processing','Processing documents...');
  addLog(`Processing ${state.files.length} file(s)...`);

  const fd=new FormData();
  state.files.forEach(f=>fd.append('files',f));
  fd.append('chunk_size',$('#sChunkSize').value);
  fd.append('chunk_overlap',$('#sChunkOverlap').value);

  // The server reports no progress, so the bar is animated towards 92 %
  // until the response arrives.
  const sim=setInterval(()=>{
    const w=parseFloat(D.progressFill.style.width)||5;
    setProgress(Math.min(w+(100-w)*0.15,92));
  },400);

  try{
    const r=await fetch('/api/upload',{method:'POST',body:fd});
    clearInterval(sim);setProgress(100);
    if(!r.ok){
      const e=await r.json().catch(()=>({}));
      throw new Error(e.detail||'Upload failed');
    }
    const d=await r.json();
    const results=d.results||[];
    const errors=d.errors||[];
    results.forEach(x=>addLog(`✓ ${x.name}: ${x.pages}p · ${x.chunks}c`));
    errors.forEach(e=>addLog(`ERROR: ${e}`));

    // Clear the queue first; renderQueue() writes an idle status that the
    // final status below must override.
    state.files=[];renderQueue();
    refreshFileList();refreshStatus();

    if(results.length){
      state.ready=true;
      setStatus('ready',`${d.total} file(s) processed · Ready for Q&A`);
      showToast(d.total+' document(s) processed!','success');
      D.chatEmpty.style.display='none';
      D.sendBtn.disabled=false;
      D.questionInput.focus();
    }else{
      setStatus('error','No documents were processed');
      showToast('No documents were processed','error');
    }
  }catch(e){
    clearInterval(sim);setProgress(0);
    setStatus('error','Processing failed');
    addLog(`ERROR: ${e.message}`);
    showToast(e.message,'error');
  }finally{
    showProgress(false);
    state.processing=false;
    D.processBtn.textContent='→ Process Documents';
    D.processBtn.disabled=state.files.length===0;
  }
});
488
+
489
+ // ─── Escape HTML ────────────────────────────────────────
490
/**
 * Escape a value for insertion into HTML.
 *
 * Encodes &, <, >, " and ' so the result is safe both as element text and
 * inside a quoted attribute value. (The previous DOM-based version left
 * quotes untouched, which is not enough for attributes such as title="…".)
 *
 * @param {*} s Value to escape; null and undefined yield an empty string,
 *              anything else is converted with String().
 * @returns {string} The escaped text.
 */
function esc(s){
  if(s==null)return'';
  const entities={'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'};
  return String(s).replace(/[&<>"']/g,ch=>entities[ch]);
}
491
+
492
+ // ─── Documents tab — load preview ───────────────────────
493
/**
 * Show the split preview for one processed file: the original document on
 * the left (/api/file/<idx>) and its OCR text on the right
 * (/api/preview/<idx>).
 *
 * Fixes over the previous version:
 *  - the file name and error messages are never parsed as HTML;
 *  - a response that arrives after the user selected another file is
 *    discarded instead of overwriting the newer preview;
 *  - text preceding the first page marker is kept;
 *  - the header no longer claims "0 pages" for text without page markers.
 *
 * @param {number} idx Index into state.fileList.
 */
async function loadPreview(idx){
  state.selectedDoc=idx;
  const f=state.fileList[idx];
  if(!f)return;

  // Highlight the matching card and sidebar item.
  $$('.doc-card').forEach((c,i)=>c.classList.toggle('selected',i===idx));
  $$('.sb-pf-item').forEach((c,i)=>c.classList.toggle('selected',i===idx));

  D.pvSplitWrap.style.display='flex';
  D.docEmptyState.style.display='none';
  D.dphTitle.textContent=f.name;
  D.dphMeta.textContent='';

  // Left pane: build the viewer with DOM APIs so f.name is plain data.
  const isPdf=f.format==='.pdf';
  const viewer=document.createElement(isPdf?'iframe':'img');
  viewer.src='/api/file/'+idx;
  if(!isPdf)viewer.alt=f.name;
  D.pvOriginal.innerHTML='';
  D.pvOriginal.appendChild(viewer);

  // Right pane: centred one-line notice (loading / empty / error).
  const notice=(text,colorVar)=>{
    const box=document.createElement('div');
    box.style.cssText=`text-align:center;color:var(${colorVar});padding:60px`;
    box.textContent=text;
    D.pvOcrText.innerHTML='';
    D.pvOcrText.appendChild(box);
  };
  // True once the user has moved on to another file.
  const stale=()=>state.selectedDoc!==idx;

  notice('Loading OCR result...','--text-muted');
  try{
    const r=await fetch('/api/preview/'+idx);
    if(stale())return;
    if(!r.ok){notice(`OCR text not available (${r.status})`,'--accent-red');return}
    const d=await r.json();
    if(stale())return;
    if(!(d.success&&d.text)){notice('OCR result is empty','--text-muted');return}

    // With a capture group, split() yields [preamble, pageNo, text, pageNo, text, …].
    const parts=d.text.split(/--- 第 (\d+) 页 ---/g);
    const pages=parts.length>>1;
    let h='';
    if(pages){
      if(parts[0].trim())h+=`<div>${esc(parts[0])}</div>`;
      for(let i=1;i<parts.length;i+=2){
        h+=`<div class="pv-page-header">Page ${parts[i]}</div><div>${esc(parts[i+1]||'')}</div>`;
      }
    }else{
      h=`<div>${esc(d.text)}</div>`;
    }
    D.pvOcrText.innerHTML=h;
    D.dphMeta.textContent=(pages?`${pages} pages · `:'')+`${d.text.length} chars`;
  }catch(e){
    if(!stale())notice('Failed to load: '+e.message,'--accent-red');
  }
}
527
+
528
+ // ─── Documents tab — file list ──────────────────────────
529
/**
 * Reload the processed-file list from /api/status and re-render the sidebar
 * list, the Preview tab's file cards and the tab badge.
 *
 * Fixes over the previous version:
 *  - every server-supplied field is escaped before it reaches innerHTML
 *    (file names originate from uploads and appear in title="…" too);
 *  - a failed request leaves the current list untouched and is logged
 *    instead of being swallowed silently;
 *  - the preview pane is hidden when the selection is no longer valid.
 */
async function refreshFileList(){
  try{
    const r=await fetch('/api/status');
    if(!r.ok)throw new Error('status request failed ('+r.status+')');
    const d=await r.json();
    const files=state.fileList=d.files||[];

    // Drop a selection that no longer points at an existing entry.
    const hasSelection=state.selectedDoc>=0&&state.selectedDoc<files.length;
    if(!hasSelection)state.selectedDoc=-1;

    // Safe for text and double-quoted attribute contexts.
    const safe=v=>esc(v).replace(/"/g,'&quot;');
    const sel=i=>i===state.selectedDoc?' selected':'';
    const isPdf=f=>f.format==='.pdf';

    // Sidebar list
    const sfl=$('#sidebarFileList');
    const spc=$('#sidebarPfCount');
    if(files.length){
      sfl.innerHTML=files.map((f,i)=>`<div class="sb-pf-item${sel(i)}" onclick="switchTab('preview');loadPreview(${i})"><span class="sb-icon">${isPdf(f)?'📄':'🖼️'}</span><span class="sb-name" title="${safe(f.name)}">${safe(f.name)}</span><span class="sb-meta">${safe(f.pages)}p·${safe(f.chunks)}c</span><button class="sb-del" onclick="event.stopPropagation();deleteDoc(${i})" title="Remove">×</button></div>`).join('');
      spc.textContent=files.length;
    }else{
      sfl.innerHTML='<div class="sb-pf-empty">No files processed yet</div>';
      spc.textContent='0';
    }

    // Preview tab
    if(files.length){
      D.docCardsWrap.style.display='';
      D.docEmptyState.style.display='none';
      D.docCards.innerHTML=files.map((f,i)=>`<div class="doc-card${sel(i)}" onclick="loadPreview(${i})"><div class="dc-icon ${isPdf(f)?'pdf':'img'}">${isPdf(f)?'PDF':'IMG'}</div><div class="dc-name" title="${safe(f.name)}">${safe(f.name)}</div><div class="dc-stats"><span>📄 <span class="dc-val">${safe(f.pages)}</span>p</span><span>🧩 <span class="dc-val">${safe(f.chunks)}</span>c</span><span>💾 <span class="dc-val">${safe(f.size_mb)}</span>MB</span></div><button class="dc-delete" onclick="event.stopPropagation();deleteDoc(${i})" title="Remove">✕</button><div style="font-family:var(--font-mono);font-size:9px;color:var(--text-muted);margin-top:6px">${safe(f.time)}</div></div>`).join('');
      D.pvSplitWrap.style.display=hasSelection?'flex':'none';
    }else{
      D.docCardsWrap.style.display='none';
      D.docCards.innerHTML='';
      D.docEmptyState.style.display='flex';
      D.pvSplitWrap.style.display='none';
    }
    D.docTabBadge.textContent=files.length;
  }catch(e){
    // Best-effort refresh: keep the current UI, but leave a trace.
    console.warn('refreshFileList failed:',e);
  }
}
553
+
554
/**
 * Activate a tab programmatically: mark its button and panel as active and
 * refresh the data shown on the "status" and "preview" tabs.
 * @param {string} name Tab key as used in data-tab and in the #tab-<name> id.
 */
function switchTab(name) {
  for (const selector of ['.tab-btn', '.tab-panel']) {
    $$(selector).forEach((el) => el.classList.remove('active'));
  }

  const button = document.querySelector(`[data-tab="${name}"]`);
  if (button) {
    button.classList.add('active');
  }

  const tabPanel = $('#tab-' + name);
  if (tabPanel) {
    tabPanel.classList.add('active');
  }

  if (name === 'status') {
    refreshStatus();
  }
  if (name === 'preview') {
    refreshFileList();
  }
}
564
+
565
/**
 * Delete a processed file via DELETE /api/files/<idx> and refresh the UI.
 *
 * Keeps the preview selection consistent with the list: deleting the
 * selected file closes its preview, and deleting an earlier entry moves the
 * selection index down by one so it keeps pointing at the same file.
 * NOTE(review): this assumes the backend addresses files by list position
 * and that later entries shift down after a delete — confirm against the API.
 *
 * @param {number} idx Index into state.fileList.
 */
async function deleteDoc(idx){
  try{
    const r=await fetch('/api/files/'+idx,{method:'DELETE'});
    if(!r.ok){showToast('Failed to remove','error');return}
    if(state.selectedDoc===idx){
      state.selectedDoc=-1;
      D.pvSplitWrap.style.display='none';
    }else if(state.selectedDoc>idx){
      state.selectedDoc-=1;
    }
    refreshFileList();
    refreshStatus();
    showToast('File removed','success');
  }catch(e){
    showToast(e.message,'error');
  }
}
568
+
569
+ // ─── Status ─────────────────────────────────────────────
570
/**
 * Refresh the System Status tab: model stack, processing parameters and
 * indexed-chunk count from /api/status, then the editable API configuration
 * form from /api/config.
 *
 * Fixes over the previous version:
 *  - the read-only config summary no longer dereferences d.ocr /
 *    d.embedding / d.llm unguarded (a missing section used to throw into an
 *    empty catch and silently skip populating the config form);
 *  - server-supplied values are escaped before being written to innerHTML;
 *  - the embedding model fallback is the same "—" used by the other models
 *    (it was a mis-encoded character);
 *  - failures are logged instead of being swallowed.
 */
async function refreshStatus(){
  try{
    const r=await fetch('/api/status');
    const d=await r.json();
    const ocr=d.ocr||{},emb=d.embedding||{},llm=d.llm||{};

    if(d.ocr){
      $('#sOCRModel').textContent=ocr.model||'PaddleOCR-VL-1.5';
      $('#sOCRBase').textContent=ocr.api_base||(ocr.engine==='paddle'?'local':'');
    }
    if(d.embedding){
      $('#sEmbedModel').textContent=emb.model||'—';
      $('#sEmbedBase').textContent=emb.api_base||'';
    }
    if(d.llm){
      $('#sLLMModel').textContent=llm.model||'—';
      $('#sLLMBase').textContent=llm.api_base||'';
    }
    if(d.vector_store)$('#sVectorDB').textContent=d.vector_store;
    if(d.params){
      $('#sChunkSize').value=d.params.chunk_size;
      $('#sChunkSizeVal').textContent=d.params.chunk_size;
      $('#sChunkOverlap').value=d.params.chunk_overlap;
      $('#sChunkOverlapVal').textContent=d.params.chunk_overlap;
      $('#sRetrievalK').value=d.params.retrieval_top_k;
    }
    if(d.document_count!==undefined){
      $('#sDBStats').innerHTML=`<div class="param-row"><span class="param-label">Indexed Chunks</span><span class="param-value">${esc(d.document_count)}</span></div>`;
    }

    // Read-only summary; the key is shown exactly as the status API returns it.
    const keyPart=k=>k?'| key: '+esc(k):'';
    $('#configContent').innerHTML=
      `<div class="config-readonly">OCR: ${esc(ocr.engine||'—')} | ${esc(ocr.model||'—')} ${keyPart(ocr.api_key)}</div>`+
      `<div class="config-readonly">Embed: ${esc(emb.model||'—')} ${keyPart(emb.api_key)}</div>`+
      `<div class="config-readonly">LLM: ${esc(llm.model||'—')} ${keyPart(llm.api_key)}</div>`;

    // Populate the edit form.
    const cr=await fetch('/api/config');
    const c=await cr.json();
    if(c.ocr){
      $('#cfgOcrEngine').value=c.ocr.engine||'paddle';
      $('#cfgOcrBase').value=c.ocr.api_base||'';
      $('#cfgOcrKey').value=c.ocr.api_key||'';
      $('#cfgOcrModel').value=c.ocr.model_name||'';
    }
    if(c.embedding){
      $('#cfgEmbedBase').value=c.embedding.api_base||'';
      $('#cfgEmbedKey').value=c.embedding.api_key||'';
      $('#cfgEmbedModel').value=c.embedding.model_name||'';
    }
    if(c.llm){
      $('#cfgLLMBase').value=c.llm.api_base||'';
      $('#cfgLLMKey').value=c.llm.api_key||'';
      $('#cfgLLMModel').value=c.llm.model_name||'';
    }
  }catch(e){
    // Best-effort refresh: keep whatever is displayed, but leave a trace.
    console.warn('refreshStatus failed:',e);
  }
}
585
+
586
+ // ─── Params ─────────────────────────────────────────────
587
// Mirror each range slider's current value into the label next to it.
$('#sChunkSize').addEventListener('input', () => {
  $('#sChunkSizeVal').textContent = $('#sChunkSize').value;
});
$('#sChunkOverlap').addEventListener('input', () => {
  $('#sChunkOverlapVal').textContent = $('#sChunkOverlap').value;
});
589
/**
 * POST the chunk size, chunk overlap and retrieval top-k from the status tab
 * to /api/config and show the outcome next to the save button.
 * A "Saved" note is cleared again after two seconds.
 */
async function saveProcessingParams() {
  const u = {
    CHUNK_SIZE: $('#sChunkSize').value,
    CHUNK_OVERLAP: $('#sChunkOverlap').value,
    RETRIEVAL_TOP_K: $('#sRetrievalK').value,
  };
  const request = {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(u),
  };
  try {
    const response = await fetch('/api/config', request);
    const result = await response.json();
    if (!result.success) {
      $('#paramsMsg').innerHTML = '<span style="color:var(--accent-red)">Failed</span>';
      return;
    }
    $('#paramsMsg').innerHTML = '<span style="color:var(--accent-green)">Saved</span>';
    setTimeout(() => {
      $('#paramsMsg').innerHTML = '';
    }, 2000);
  } catch (e) {
    $('#paramsMsg').innerHTML = '<span style="color:var(--accent-red)">' + e.message + '</span>';
  }
}
595
+
596
+ // ─── Chat ───────────────────────────────────────────────
597
/**
 * Append a chat bubble, hide the empty-state placeholder and scroll the
 * conversation to the bottom. The content is rendered through fmt().
 * @param {string} role    "user" or anything else (shown as "AI"); also used
 *                         as a CSS class on the message element.
 * @param {string} content Message text.
 */
function addMessage(role, content) {
  D.chatEmpty.style.display = 'none';
  const avatar = role === 'user' ? 'You' : 'AI';
  const bubble = document.createElement('div');
  bubble.className = 'message ' + role;
  bubble.innerHTML = `<div class="msg-avatar">${avatar}</div><div class="msg-bubble">${fmt(content)}</div>`;
  D.chatMessages.appendChild(bubble);
  D.chatMessages.scrollTop = D.chatMessages.scrollHeight;
}
598
/**
 * Append the animated "assistant is typing" placeholder (id "typingMsg")
 * to the conversation and scroll it into view.
 */
function addTyping() {
  const placeholder = document.createElement('div');
  placeholder.className = 'message assistant';
  placeholder.id = 'typingMsg';
  placeholder.innerHTML = '<div class="msg-avatar">AI</div><div class="msg-bubble"><div class="typing-indicator"><span></span><span></span><span></span></div></div>';
  D.chatMessages.appendChild(placeholder);
  D.chatMessages.scrollTop = D.chatMessages.scrollHeight;
}
599
/** Remove the typing placeholder if it is currently shown. */
function remTyping() {
  const placeholder = document.getElementById('typingMsg');
  if (!placeholder) return;
  placeholder.remove();
}
600
/**
 * Render a chat message as HTML: escapes &, < and >, then supports
 * **bold**, *italic*, `code`, blank-line paragraphs and single line breaks.
 *
 * Code spans are lifted out before emphasis is applied, so asterisks inside
 * backticks stay literal (previously `a*b*c` rendered with <em> inside the
 * code element). Non-string truthy input is converted with String() instead
 * of throwing.
 *
 * @param {*} t Message text; any falsy value yields an empty string.
 * @returns {string} HTML wrapped in <p>…</p>, or '' for falsy input.
 */
function fmt(t){
  if(!t)return'';
  const codeSpans=[];
  const h=String(t)
    .replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;')
    // Park each code span behind a NUL-delimited placeholder.
    .replace(/`([^`]+)`/g,(_,code)=>{codeSpans.push(code);return '\u0000'+(codeSpans.length-1)+'\u0000'})
    .replace(/\*\*(.+?)\*\*/g,'<strong>$1</strong>')
    .replace(/\*(.+?)\*/g,'<em>$1</em>')
    // Restore the code spans untouched by the emphasis rules.
    .replace(/\u0000(\d+)\u0000/g,(_,i)=>'<code>'+codeSpans[i]+'</code>');
  return '<p>'+h.replace(/\n\n/g,'</p><p>').replace(/\n/g,'<br>')+'</p>';
}
601
/**
 * Send the question in the input box to /api/chat, append the answer to the
 * conversation and fill the "Sources & References" panel.
 * Does nothing when the input is empty or no documents have been processed
 * in this session (state.ready is false).
 *
 * Fixes over the previous version:
 *  - source fields (document name, page, rank, type) are escaped before
 *    being written to innerHTML;
 *  - "..." is only appended to excerpts that were actually truncated;
 *  - sources from a previous answer are cleared when the new answer has none;
 *  - an error response whose body is not JSON no longer surfaces as a
 *    JSON parse error.
 */
async function sendQuestion(){
  const q=D.questionInput.value.trim();
  if(!q||!state.ready)return;
  D.questionInput.value='';
  D.sendBtn.disabled=true;
  D.questionInput.style.height='auto';
  addMessage('user',q);
  addTyping();
  try{
    const r=await fetch('/api/chat',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({question:q})});
    if(!r.ok){
      const e=await r.json().catch(()=>({}));
      throw new Error(e.detail||'Chat failed');
    }
    const d=await r.json();
    remTyping();
    addMessage('assistant',d.answer);

    const sources=d.sources||[];
    if(sources.length){
      D.sContent.innerHTML=sources.map(s=>{
        const text=s.content||'';
        const excerpt=text.length>200?text.substring(0,200)+'...':text;
        return `<div class="source-item"><div class="src-header"><span>#${esc(s.rank)}</span><span>${esc(s.document||'')}</span><span>Page ${esc(s.page)}</span><span>${esc(s.content_type||'')}</span></div><div class="src-excerpt">${esc(excerpt)}</div></div>`;
      }).join('');
      D.sToggle.style.display='flex';
    }else{
      // Nothing to cite for this answer: collapse and hide the panel.
      D.sContent.innerHTML='';
      D.sContent.classList.remove('open');
      D.sToggle.textContent='↓ Sources & References';
      D.sToggle.style.display='none';
    }
  }catch(e){
    remTyping();
    addMessage('assistant','Error: '+e.message);
    showToast(e.message,'error');
  }finally{
    D.sendBtn.disabled=false;
    D.questionInput.focus();
  }
}
610
// Send on button click.
D.sendBtn.addEventListener('click', sendQuestion);

// Enter sends, Shift+Enter inserts a newline.
D.questionInput.addEventListener('keydown', e => {
  const isPlainEnter = e.key === 'Enter' && !e.shiftKey;
  if (!isPlainEnter) return;
  e.preventDefault();
  sendQuestion();
});

// Auto-grow the textarea with its content, capped at 120px.
D.questionInput.addEventListener('input', () => {
  const box = D.questionInput;
  box.style.height = 'auto';
  box.style.height = Math.min(box.scrollHeight, 120) + 'px';
});

// Delegated handler: clicking a .quick-prompt element copies its data-question into the input.
document.addEventListener('click', e => {
  if (!e.target.classList.contains('quick-prompt')) return;
  const box = D.questionInput;
  box.value = e.target.dataset.question;
  box.focus();
  box.style.height = 'auto';
  box.style.height = Math.min(box.scrollHeight, 120) + 'px';
});

// Expand / collapse the sources panel and update the toggle label accordingly.
D.sToggle.addEventListener('click', () => {
  D.sContent.classList.toggle('open');
  const isOpen = D.sContent.classList.contains('open');
  D.sToggle.textContent = isOpen ? '↑ Hide Sources & References' : '↓ Sources & References';
});

// ─── Tabs ───────────────────────────────────────────────
// Activate the clicked tab and its panel; the status and preview tabs refresh their data on entry.
$$('.tab-btn').forEach(btn => {
  btn.addEventListener('click', () => {
    $$('.tab-btn').forEach(other => other.classList.remove('active'));
    $$('.tab-panel').forEach(panel => panel.classList.remove('active'));
    btn.classList.add('active');

    const panel = $('#tab-' + btn.dataset.tab);
    if (panel) panel.classList.add('active');

    if (btn.dataset.tab === 'status') refreshStatus();
    if (btn.dataset.tab === 'preview') refreshFileList();
  });
});
618
+
619
+ // ─── Config ─────────────────────────────────────────────
620
// Switch the input with the given id between masked and plain text,
// and update the toggle button's glyph to match.
function togglePassword(id, btn) {
  const field = document.getElementById(id);
  const currentlyMasked = field.type === 'password';
  field.type = currentlyMasked ? 'text' : 'password';
  btn.textContent = currentlyMasked ? '—' : '👁';
}
621
// Toggle between the read-only config view and the edit form.
// Entering edit mode also calls refreshStatus().
function toggleConfigEdit() {
  const editPanel = $('#configEdit');
  if (editPanel.style.display !== 'none') {
    // Already editing: behave like the Cancel button.
    cancelConfigEdit();
    return;
  }
  $('#configDisplay').style.display = 'none';
  editPanel.style.display = 'block';
  $('#editConfigBtn').textContent = 'Cancel';
  refreshStatus();
}
622
// Leave edit mode: show the read-only view, hide the form,
// restore the button label and clear any status message.
function cancelConfigEdit() {
  const displayPanel = $('#configDisplay');
  const editPanel = $('#configEdit');
  displayPanel.style.display = 'block';
  editPanel.style.display = 'none';
  $('#editConfigBtn').textContent = 'Edit';
  $('#configMsg').innerHTML = '';
}
623
// Collect the config form values and POST them to /api/config.
// Shows a green confirmation (then leaves edit mode and refreshes status after 1.5s)
// when the response has `success` set, otherwise a red failure message.
async function saveConfig() {
  // Config key -> selector of the form control that holds its value.
  const fields = [
    ['OCR_ENGINE', '#cfgOcrEngine'],
    ['OCR_API_BASE', '#cfgOcrBase'],
    ['OCR_API_KEY', '#cfgOcrKey'],
    ['OCR_API_MODEL', '#cfgOcrModel'],
    ['EMBEDDING_API_BASE', '#cfgEmbedBase'],
    ['EMBEDDING_API_KEY', '#cfgEmbedKey'],
    ['EMBEDDING_MODEL_NAME', '#cfgEmbedModel'],
    ['LLM_API_BASE', '#cfgLLMBase'],
    ['LLM_API_KEY', '#cfgLLMKey'],
    ['LLM_MODEL_NAME', '#cfgLLMModel'],
  ];
  const u = {};
  for (const [key, selector] of fields) u[key] = $(selector).value;

  // Render status text through textContent so that error messages containing
  // '<' or '&' (e.g. a JSON parse error quoting an HTML page) are shown literally
  // instead of being parsed as markup.
  const showMsg = (text, colorVar) => {
    const box = $('#configMsg');
    const span = document.createElement('span');
    span.style.color = 'var(' + colorVar + ')';
    span.textContent = text;
    box.innerHTML = '';
    box.appendChild(span);
  };

  try {
    const r = await fetch('/api/config', {
      method: 'POST',
      headers: {'Content-Type': 'application/json'},
      body: JSON.stringify(u),
    });
    const d = await r.json();
    if (d.success) {
      showMsg('Config saved. Restart to apply.', '--accent-green');
      setTimeout(() => { cancelConfigEdit(); refreshStatus(); }, 1500);
    } else {
      showMsg('Save failed', '--accent-red');
    }
  } catch (e) {
    showMsg(e.message, '--accent-red');
  }
}
626
// The Edit/Cancel button toggles the config form.
const editConfigButton = document.getElementById('editConfigBtn');
editConfigButton.addEventListener('click', toggleConfigEdit);
627
+
628
+ // ─── Clear chat ─────────────────────────────────────────
629
// Clear the conversation: ask the server to drop its history (DELETE /api/chat),
// then reset the message list, the sources panel and the input box.
// The local reset happens even when the request fails (best effort), but the
// failure is now logged instead of being silently discarded.
async function clearChat() {
  try {
    const r = await fetch('/api/chat', {method: 'DELETE'});
    if (!r.ok) console.warn('clearChat: server responded with status', r.status);
  } catch (e) {
    console.warn('clearChat: request failed', e);
  }

  D.chatMessages.innerHTML = '';
  // Re-attach the empty-state placeholder that innerHTML = '' just detached.
  D.chatMessages.appendChild(D.chatEmpty);
  D.chatEmpty.style.display = 'flex';

  D.sToggle.style.display = 'none';
  D.sContent.innerHTML = '';
  D.sContent.classList.remove('open');

  D.questionInput.value = '';
  D.questionInput.focus();
}
630
// Ctrl+Shift+K clears the chat.
document.addEventListener('keydown', e => {
  const isClearShortcut = e.ctrlKey && e.shiftKey && e.key === 'K';
  if (!isClearShortcut) return;
  e.preventDefault();
  clearChat();
});

// Right-click inside the message list offers to clear the chat (native menu is suppressed).
D.chatMessages.addEventListener('contextmenu', e => {
  e.preventDefault();
  if (confirm('Clear all chat history?')) {
    clearChat();
  }
});

// ─── Init ───────────────────────────────────────────────
// Initial render of the upload queue, status panel and file list.
renderQueue();
refreshStatus();
refreshFileList();
635
+ </script>
636
+ </body>
637
+ </html>
test.png ADDED

Git LFS Details

  • SHA256: 5d897dbc38348df6b12db9afac20549b78a004e1746b4b4a64ad40f1651a0abd
  • Pointer size: 131 Bytes
  • Size of remote file: 121 kB
text_processor.py ADDED
@@ -0,0 +1,606 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ============================================================
3
+ 文本处理模块: Markdown 清洗 + 智能分割 (Chunking)
4
+ ============================================================
5
+ 适配 PaddleOCR-VL-1.5 输出的 Markdown 格式文本
6
+
7
+ 功能:
8
+ 1. Markdown 文本清洗 (保留表格/公式结构)
9
+ 2. 基于 LangChain 的语义感知分割
10
+ 3. 表格/公式专项处理
11
+ """
12
+
13
+ import re
14
+ from typing import List, Optional, Callable
15
+
16
+ from langchain_core.documents import Document
17
+
18
+ from loguru import logger
19
+
20
+ import config
21
+
22
+
23
+ # ============================================================
24
+ # 内置递归文本分割器 (替代 langchain_text_splitters)
25
+ # ============================================================
26
+ # 避免 langchain_text_splitters → sentence_transformers → transformers
27
+ # 的传递依赖链在部分环境中导致的兼容性问题
28
+
29
+
30
class RecursiveCharacterTextSplitter:
    """
    Recursive character-based text splitter.

    Mirrors the constructor and the ``split_documents`` / ``create_documents``
    entry points of ``langchain_text_splitters.RecursiveCharacterTextSplitter``.
    Text is split on the first separator (in priority order) that occurs in it;
    pieces that are still too long are split again with the remaining
    separators, and finally cut into fixed-size windows when no separator is
    left. The resulting pieces are merged back into chunks of at most
    ``chunk_size`` (as measured by ``length_function``).

    Note: merged pieces are always joined with a blank line (``"\\n\\n"``),
    regardless of which separator produced them.
    """

    # Text inserted between two pieces when they are merged into one chunk.
    _JOINER = "\n\n"

    def __init__(
        self,
        chunk_size: int = 800,
        chunk_overlap: int = 150,
        separators: Optional[List[str]] = None,
        add_start_index: bool = True,
        length_function: Callable[[str], int] = len,
        keep_separator: bool = True,
        strip_whitespace: bool = True,
    ):
        """
        Args:
            chunk_size: Maximum size of a chunk.
            chunk_overlap: Number of trailing characters of the previous chunk
                repeated at the start of the next one.
            separators: Separators in priority order; an empty string means
                "no natural boundary, cut by size".
            add_start_index: Store a ``start_index`` entry in chunk metadata.
            length_function: Function used to measure piece length.
            keep_separator: Keep the separator attached to the piece that
                follows it.
            strip_whitespace: Strip leading/trailing whitespace from pieces.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # NOTE(review): "!", "?" and ";" appear twice below; the first group was
        # presumably meant to be the full-width CJK variants — confirm upstream.
        self.separators = separators or ["\n\n", "\n", "。", "!", "?", ";", ".", "!", "?", ";", " ", ""]
        self.add_start_index = add_start_index
        self.length_function = length_function
        self.keep_separator = keep_separator
        self.strip_whitespace = strip_whitespace

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """Split every document and return all resulting chunks in order."""
        chunks = []
        for doc in documents:
            chunks.extend(self.split_text(doc.page_content, doc.metadata))
        return chunks

    def split_text(self, text: str, metadata: Optional[dict] = None) -> List[Document]:
        """
        Split one text into chunk Documents.

        Each chunk receives a copy of ``metadata``. When ``add_start_index`` is
        set, ``start_index`` is the offset of the chunk in ``text``; the search
        starts after the previous chunk's offset so repeated content does not
        always map to its first occurrence. Chunks that cannot be located
        verbatim (e.g. because pieces were re-joined) get ``0``.
        """
        metadata = metadata or {}
        chunks = self._merge(self._split(text, self.separators))

        docs = []
        cursor = 0
        for chunk in chunks:
            chunk_meta = {**metadata}
            if self.add_start_index:
                position = text.find(chunk, cursor)
                if position == -1:
                    position = text.find(chunk)
                if position == -1:
                    position = 0
                else:
                    cursor = position + 1
                chunk_meta["start_index"] = position
            docs.append(Document(page_content=chunk, metadata=chunk_meta))
        return docs

    def create_documents(
        self, texts: List[str], metadatas: Optional[List[dict]] = None
    ) -> List[Document]:
        """Split several texts, pairing each with the metadata at the same index."""
        metadatas = metadatas or [{}] * len(texts)
        docs = []
        for text, meta in zip(texts, metadatas):
            docs.extend(self.split_text(text, meta))
        return docs

    def _split(self, text: str, separators: List[str]) -> List[str]:
        """Recursively split ``text`` into pieces no longer than ``chunk_size``."""
        if not separators:
            return self._force_split(text)

        # Pick the first separator that occurs in the text; fall back to the last one.
        sep = separators[-1]
        sep_pos = len(separators) - 1
        for pos, candidate in enumerate(separators):
            if candidate == "" or candidate in text:
                sep, sep_pos = candidate, pos
                break

        if sep == "":
            # No natural boundary left. Cut by size instead of exploding the text
            # into single characters (which _merge would re-join with blank lines).
            return self._force_split(text)

        if self.keep_separator:
            # The separator stays attached to the start of the piece that follows it.
            head, *tail = text.split(sep)
            pieces = [head] + [sep + part for part in tail]
        else:
            pieces = text.split(sep)

        if self.strip_whitespace:
            pieces = [piece.strip() for piece in pieces]
        pieces = [piece for piece in pieces if piece]

        remaining = separators[sep_pos + 1:]
        final_splits = []
        for piece in pieces:
            if self.length_function(piece) <= self.chunk_size:
                final_splits.append(piece)
            elif remaining:
                final_splits.extend(self._split(piece, remaining))
            else:
                # Separators exhausted: fall back to fixed-size windows rather than
                # recursing with an empty separator list.
                final_splits.extend(self._force_split(piece))
        return final_splits

    def _force_split(self, text: str) -> List[str]:
        """Cut ``text`` into windows of ``chunk_size`` characters (last resort)."""
        step = self.chunk_size - self.chunk_overlap
        if step <= 0:
            # Overlap >= size would give a zero/negative stride; advance without overlap.
            step = max(1, self.chunk_size)

        chunks = []
        for start in range(0, len(text), step):
            window = text[start:start + self.chunk_size]
            if self.strip_whitespace:
                window = window.strip()
            if window:
                chunks.append(window)
            if start + self.chunk_size >= len(text):
                # This window reached the end; further windows would only repeat its tail.
                break
        return chunks

    def _merge(self, splits: List[str]) -> List[str]:
        """Merge consecutive pieces into chunks of at most ``chunk_size``."""
        if not splits:
            return []

        joiner = self._JOINER
        joiner_len = self.length_function(joiner)

        chunks = []
        current = ""
        current_len = 0

        for split in splits:
            split_len = self.length_function(split)

            if not current:
                current, current_len = split, split_len
                continue

            # The joiner counts toward the chunk size.
            if current_len + joiner_len + split_len <= self.chunk_size:
                current += joiner + split
                current_len += joiner_len + split_len
                continue

            chunks.append(current)

            # Carry over the tail of the finished chunk as overlap, trimmed so that
            # the new chunk still respects chunk_size.
            keep = min(self.chunk_overlap, self.chunk_size - split_len - joiner_len)
            if keep > 0:
                current = current[-keep:] + joiner + split
                current_len = self.length_function(current)
            else:
                current, current_len = split, split_len

        if current:
            chunks.append(current)

        return chunks
185
+
186
+
187
+ # ============================================================
188
+ # Markdown 文本清洗器
189
+ # ============================================================
190
+
191
class MarkdownTextCleaner:
    """Cleaning helpers for the Markdown text produced by PaddleOCR-VL-1.5."""

    @staticmethod
    def clean(text: str, preserve_structure: bool = True) -> str:
        """
        Clean a Markdown text.

        - Removes control characters (newlines and tabs are kept).
        - Normalizes line endings to ``\\n``.
        - Collapses runs of spaces and repeated blank lines.

        With ``preserve_structure`` (default) table rows and fenced code blocks
        only have trailing whitespace removed, so column alignment and code
        indentation survive. Fenced code is checked before anything else, so a
        fence that directly follows a table is recognized as a fence.

        Args:
            text: Raw Markdown text.
            preserve_structure: Keep table rows and code blocks intact.

        Returns:
            The cleaned text, or ``""`` for empty input.
        """
        if not text:
            return ""

        cleaned = text.strip()

        # Drop control characters (keeps \t, \n, \r).
        cleaned = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', cleaned)

        # Normalize line endings.
        cleaned = cleaned.replace('\r\n', '\n').replace('\r', '\n')

        if preserve_structure:
            cleaned_lines = []
            in_table = False
            in_code = False

            for line in cleaned.split('\n'):
                stripped = line.strip()

                # Code fences take priority: they open/close a protected region
                # and always terminate a table.
                if stripped.startswith('```'):
                    in_code = not in_code
                    in_table = False
                    cleaned_lines.append(line.rstrip())
                    continue

                if in_code:
                    # Inside a fence: keep indentation and inner spacing.
                    cleaned_lines.append(line.rstrip())
                    continue

                # A table row starts with '|' and contains at least one more '|'.
                if stripped.startswith('|') and '|' in stripped[1:]:
                    in_table = True
                    cleaned_lines.append(line.rstrip())
                    continue

                # Delimiter-like line (only pipes, dashes, colons) right inside a table.
                if in_table and stripped and re.match(r'^[\s\|:\-]+$', line):
                    cleaned_lines.append(line.rstrip())
                    continue

                # Ordinary line (this also ends any table): trim and collapse spaces.
                in_table = False
                normalized = re.sub(r' +', ' ', stripped)
                if normalized:
                    cleaned_lines.append(normalized)
                elif cleaned_lines and cleaned_lines[-1] != '':
                    # Keep at most one consecutive blank line.
                    cleaned_lines.append('')

            cleaned = '\n'.join(cleaned_lines)
        else:
            cleaned = re.sub(r' +', ' ', cleaned)
            cleaned = re.sub(r' *\n *', '\n', cleaned)

        # Cap runs of newlines at three.
        cleaned = re.sub(r'\n{4,}', '\n\n\n', cleaned)

        return cleaned.strip()

    @staticmethod
    def clean_documents(documents: List[Document]) -> List[Document]:
        """
        Clean a list of Documents.

        Documents whose text is empty after cleaning are dropped. Surviving
        documents get ``cleaned``, ``original_length`` and ``cleaned_length``
        added to a copy of their metadata.
        """
        cleaned_docs = []
        for doc in documents:
            original_len = len(doc.page_content)
            cleaned_text = MarkdownTextCleaner.clean(doc.page_content)

            if not cleaned_text:
                logger.debug(
                    f"页面 {doc.metadata.get('page', '?')} 清洗后为空, 已跳过"
                )
                continue

            cleaned_docs.append(
                Document(
                    page_content=cleaned_text,
                    metadata={
                        **doc.metadata,
                        "cleaned": True,
                        "original_length": original_len,
                        "cleaned_length": len(cleaned_text),
                    },
                )
            )

        logger.info(
            f"文本清洗: {len(documents)} → {len(cleaned_docs)} 个文档 "
            f"(移除 {len(documents) - len(cleaned_docs)} 个空白页)"
        )
        return cleaned_docs

    @staticmethod
    def extract_tables_as_chunks(documents: List[Document]) -> List[Document]:
        """
        Turn the tables listed in document metadata into standalone chunks.

        Reads the ``tables_html`` and ``tables_markdown`` metadata lists and
        pairs them by position. The Markdown form is preferred as chunk text;
        HTML is used when no Markdown is available. The lists may differ in
        length (or one may be missing): unmatched entries are still emitted.
        """
        # Local import keeps this change self-contained within the class.
        from itertools import zip_longest

        table_docs = []
        for doc in documents:
            tables_html = doc.metadata.get("tables_html") or []
            tables_md = doc.metadata.get("tables_markdown") or []

            # zip_longest (not zip) so that Markdown-only tables are not dropped.
            pairs = zip_longest(tables_html, tables_md, fillvalue="")
            for i, (html, md) in enumerate(pairs):
                content = md or html
                if not content or not content.strip():
                    continue
                table_docs.append(
                    Document(
                        page_content=f"[表格数据]\n{content}",
                        metadata={
                            **doc.metadata,
                            "content_type": "table",
                            "table_index": i,
                            "table_html": html,
                            "table_markdown": md,
                        },
                    )
                )

        if table_docs:
            logger.info(f"提取了 {len(table_docs)} 个表格块")
        return table_docs

    @staticmethod
    def extract_formulas_as_chunks(documents: List[Document]) -> List[Document]:
        """
        Turn the LaTeX formulas listed in ``formulas_latex`` metadata into
        standalone chunks, each wrapped in ``$$ ... $$``. Blank entries are skipped.
        """
        formula_docs = []
        for doc in documents:
            formulas_latex = doc.metadata.get("formulas_latex") or []
            for i, latex in enumerate(formulas_latex):
                if not latex or not latex.strip():
                    continue
                formula_docs.append(
                    Document(
                        page_content=f"[公式]\n$${latex}$$",
                        metadata={
                            **doc.metadata,
                            "content_type": "formula",
                            "formula_index": i,
                            "formula_latex": latex,
                        },
                    )
                )

        if formula_docs:
            logger.info(f"提取了 {len(formula_docs)} 个公式块")
        return formula_docs
343
+
344
+
345
+ # ============================================================
346
+ # 智能文本分割器
347
+ # ============================================================
348
+
349
class DocumentSplitter:
    """
    Size-based document splitter.

    Thin wrapper around RecursiveCharacterTextSplitter that takes its defaults
    (chunk size, overlap, separators) from the project configuration and logs
    a summary of each split.
    """

    def __init__(
        self,
        chunk_size: int = config.CHUNK_SIZE,
        chunk_overlap: int = config.CHUNK_OVERLAP,
        separators: Optional[List[str]] = None,
    ):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Fall back to the configured separator list when none is supplied.
        self.separators = separators or config.SEPARATORS

        splitter_options = {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "separators": self.separators,
            "add_start_index": True,
            "length_function": len,
            "keep_separator": True,
            "strip_whitespace": True,
        }
        self._splitter = RecursiveCharacterTextSplitter(**splitter_options)

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """Split a list of documents; an empty input returns an empty list."""
        if documents:
            pieces = self._splitter.split_documents(documents)
            logger.info(
                f"文本分割: {len(documents)} → {len(pieces)} 个文本块 "
                f"(块大小={self.chunk_size}, 重叠={self.chunk_overlap})"
            )
            return pieces
        return []

    def split_text(self, text: str, metadata: Optional[dict] = None) -> List[Document]:
        """Split a single text, attaching ``metadata`` (or an empty dict) to each chunk."""
        meta = metadata or {}
        return self._splitter.create_documents([text], metadatas=[meta])
396
+
397
+
398
class MarkdownAwareSplitter:
    """
    Markdown-aware splitter.

    Splits at Markdown heading boundaries (levels 1-3) while keeping fenced
    code blocks and tables intact, then merges neighbouring sections up to a
    target size. No text is discarded: sections too small to stand alone are
    attached to a neighbour instead of being dropped.

    Note: a single section larger than ``target_chunk_size`` is emitted as is;
    it is not split further.
    """

    def __init__(
        self,
        target_chunk_size: int = config.CHUNK_SIZE,
        min_chunk_size: int = 100,
    ):
        """
        Args:
            target_chunk_size: Size up to which sections are merged.
            min_chunk_size: Below this size a chunk is not emitted on its own
                (unless it is the only content of the document).
        """
        self.target_chunk_size = target_chunk_size
        self.min_chunk_size = min_chunk_size

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """Split every document along its Markdown structure."""
        all_chunks = []

        for doc in documents:
            sections = self._split_by_headers(doc.page_content)
            chunks = self._merge_sections(
                sections, doc.metadata, self.target_chunk_size, self.min_chunk_size
            )
            all_chunks.extend(chunks)

        logger.info(
            f"Markdown 感知分割: {len(documents)} → {len(all_chunks)} 个文本块"
        )
        return all_chunks

    @staticmethod
    def _split_by_headers(text: str) -> List[str]:
        """
        Split ``text`` before every ``#`` / ``##`` / ``###`` heading.

        Fenced code blocks and tables are temporarily replaced by placeholders
        so that ``#`` characters inside them never trigger a split.
        """
        protected: List[str] = []

        def protect(match):
            block = match.group(0)
            # Keep a trailing newline outside the placeholder; otherwise a heading
            # that directly follows a table would lose the newline the heading
            # split relies on.
            trailing = "\n" if block.endswith("\n") else ""
            if trailing:
                block = block[:-1]
            protected.append(block)
            return f"__PROTECTED_{len(protected) - 1}__{trailing}"

        # Protect fenced code blocks.
        text = re.sub(r'```[\s\S]*?```', protect, text)
        # Protect tables (runs of consecutive '|' rows).
        text = re.sub(
            r'(?:^\|.+\|\n)+(?:^\|[\s\-:]+\|\n)?(?:^\|.+\|\n?)+',
            protect,
            text,
            flags=re.MULTILINE,
        )

        # Split right before a heading of level 1-3.
        raw_sections = re.split(r'\n(?=#{1,3}\s)', text)

        sections = []
        for section in raw_sections:
            # Restore newest placeholders first: a protected table may itself contain
            # the placeholder of a code block that was protected earlier.
            for idx in range(len(protected) - 1, -1, -1):
                section = section.replace(f"__PROTECTED_{idx}__", protected[idx])
            section = section.strip()
            if section:
                sections.append(section)

        return sections

    @staticmethod
    def _merge_sections(
        sections: List[str],
        base_metadata: dict,
        target_size: int,
        min_size: int,
    ) -> List[Document]:
        """
        Merge consecutive sections into chunks of roughly ``target_size``.

        Each chunk's metadata is a copy of ``base_metadata`` plus
        ``chunk_sections`` (``"first-last"`` section indices) and ``chunk_type``.

        A pending chunk shorter than ``min_size`` is never discarded: it is
        carried into the following section (so a short heading stays with its
        body, even if that exceeds ``target_size``), and a short trailing
        remainder is appended to the previous chunk or, if there is none,
        emitted on its own.
        """

        def build(content: str, first: int, last: int) -> Document:
            return Document(
                page_content=content,
                metadata={
                    **base_metadata,
                    "chunk_sections": f"{first}-{last}",
                    "chunk_type": "markdown_semantic",
                },
            )

        chunks = []
        current = ""
        start_idx = 0
        last_start = 0  # first section index of the most recently emitted chunk

        for i, section in enumerate(sections):
            if not current:
                current, start_idx = section, i
            elif (
                len(current) + len(section) + 2 <= target_size
                or len(current) < min_size
            ):
                current += "\n\n" + section
            else:
                chunks.append(build(current, start_idx, i - 1))
                last_start = start_idx
                current, start_idx = section, i

        if current:
            last_idx = len(sections) - 1
            if len(current) >= min_size or not chunks:
                chunks.append(build(current, start_idx, last_idx))
            else:
                # Short tail: fold it into the previous chunk and extend its range.
                chunks[-1].page_content += "\n\n" + current
                chunks[-1].metadata["chunk_sections"] = f"{last_start}-{last_idx}"

        return chunks
513
+
514
+
515
+ # ============================================================
516
+ # 完整处理流水线
517
+ # ============================================================
518
+
519
class TextProcessingPipeline:
    """
    End-to-end text processing pipeline.

    Usage:
        pipeline = TextProcessingPipeline()
        chunks = pipeline.process(raw_documents)
    """

    def __init__(
        self,
        chunk_size: int = config.CHUNK_SIZE,
        chunk_overlap: int = config.CHUNK_OVERLAP,
        split_method: str = "recursive",
        extract_tables: bool = True,
        extract_formulas: bool = False,
        clean_text: bool = True,
    ):
        """
        Args:
            chunk_size: Target chunk size passed to the splitter.
            chunk_overlap: Overlap between chunks (recursive splitter only).
            split_method: ``"markdown"`` selects MarkdownAwareSplitter; any
                other value selects DocumentSplitter.
            extract_tables: Emit tables from metadata as extra chunks.
            extract_formulas: Emit formulas from metadata as extra chunks.
            clean_text: Run MarkdownTextCleaner before splitting.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.split_method = split_method
        self.extract_tables = extract_tables
        self.extract_formulas = extract_formulas
        self.clean_text = clean_text

        if split_method != "markdown":
            self.splitter = DocumentSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            )
        else:
            # Minimum chunk size is a quarter of the target, but never below 50.
            smallest = max(50, chunk_size // 4)
            self.splitter = MarkdownAwareSplitter(
                target_chunk_size=chunk_size,
                min_chunk_size=smallest,
            )

    def process(self, documents: List[Document]) -> List[Document]:
        """
        Run the full pipeline:
        raw documents → clean → extract tables/formulas → split → final chunks.

        Every returned chunk gets a sequential ``chunk_id`` in its metadata.
        """
        working = list(documents)
        logger.info(f"文本处理流水线启动: {len(working)} 个原始文档")

        # Step 1: cleaning
        if self.clean_text:
            working = MarkdownTextCleaner.clean_documents(working)

        # Step 2: tables and formulas as standalone chunks (tables first)
        extractors = []
        if self.extract_tables:
            extractors.append(MarkdownTextCleaner.extract_tables_as_chunks)
        if self.extract_formulas:
            extractors.append(MarkdownTextCleaner.extract_formulas_as_chunks)
        special_chunks = []
        for extractor in extractors:
            special_chunks.extend(extractor(working))

        # Step 3: splitting
        result = self.splitter.split_documents(working)

        # Step 4: append the special chunks after the regular ones
        if special_chunks:
            result.extend(special_chunks)
            logger.info(f"合并特殊块后总计: {len(result)} 个文本块")

        # Step 5: assign chunk ids
        for position, piece in enumerate(result):
            piece.metadata["chunk_id"] = f"chunk_{position:06d}"

        logger.info(f"文本处理完成: {len(documents)} 页 → {len(result)} 个文本块")
        return result
588
+
589
+
590
+ # ============================================================
591
+ # 便捷函数
592
+ # ============================================================
593
+
594
def process_documents(
    documents: List[Document],
    chunk_size: int = config.CHUNK_SIZE,
    chunk_overlap: int = config.CHUNK_OVERLAP,
    **kwargs,
) -> List[Document]:
    """
    Convenience wrapper: build a TextProcessingPipeline and run it once.

    Extra keyword arguments are forwarded to the pipeline constructor.
    """
    options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap, **kwargs}
    return TextProcessingPipeline(**options).process(documents)
vector_store.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ============================================================
3
+ 向量数据库存储模块
4
+ ============================================================
5
+ 嵌入模型: Qwen3-Embedding 系列
6
+ 向量数据库: Chroma / FAISS
7
+
8
+ 功能:
9
+ 1. 文档批量向量化入库
10
+ 2. 相似度检索 / MMR / 元数据过滤
11
+ 3. 持久化与增量更新
12
+ """
13
+
14
+ from pathlib import Path
15
+ from typing import List, Optional, Dict, Any, Callable
16
+
17
+ from langchain_core.documents import Document
18
+ from langchain_core.embeddings import Embeddings
19
+ from langchain_core.vectorstores import VectorStore
20
+
21
+ from langchain_community.vectorstores import Chroma, FAISS
22
+
23
+ from loguru import logger
24
+
25
+ import config
26
+ from embeddings import get_embedding_model
27
+
28
+
29
+ # ============================================================
30
+ # 向量数据库工厂
31
+ # ============================================================
32
+
33
class VectorStoreFactory:
    """Factory for the supported vector store backends (Chroma, FAISS)."""

    @staticmethod
    def create_chroma(
        persist_directory: Optional[str | Path] = None,
        collection_name: str = config.CHROMA_COLLECTION_NAME,
        embedding_function: Optional[Embeddings] = None,
    ) -> Chroma:
        """
        Create a Chroma store configured for cosine distance.

        Args:
            persist_directory: Storage directory; defaults to
                ``config.VECTOR_DB_DIR / "chroma"``.
            collection_name: Name of the Chroma collection.
            embedding_function: Embedding model; defaults to ``get_embedding_model()``.
        """
        persist_dir = str(persist_directory or config.VECTOR_DB_DIR / "chroma")
        embedding = embedding_function or get_embedding_model()

        logger.info(f"创建 Chroma 向量数据库: {persist_dir} (集合: {collection_name})")

        return Chroma(
            collection_name=collection_name,
            embedding_function=embedding,
            persist_directory=persist_dir,
            collection_metadata={
                "hnsw:space": "cosine",  # cosine distance for the Qwen3-Embedding vectors
                "hnsw:construction_ef": 200,
                "hnsw:M": 48,
            },
        )

    @staticmethod
    def create_faiss(
        embedding_function: Optional[Embeddings] = None,
    ) -> FAISS:
        """
        Create an empty, usable FAISS store backed by a flat L2 index.

        The store is built with a real index and an in-memory docstore;
        constructing ``FAISS`` with ``index=None`` / ``docstore=None`` yields an
        object that fails on the first ``add_documents`` call.

        The index dimension is obtained by embedding a short probe string, so
        this performs one call to the embedding model.
        """
        # Local imports: both come from langchain_community, which this module
        # already depends on; the faiss package itself is loaded through
        # LangChain's helper so a missing install produces its standard error.
        from langchain_community.docstore.in_memory import InMemoryDocstore
        from langchain_community.vectorstores.faiss import dependable_faiss_import

        embedding = embedding_function or get_embedding_model()
        logger.info("创建 FAISS 向量数据库 (flat L2 index)")

        faiss = dependable_faiss_import()
        dimension = len(embedding.embed_query("dimension probe"))

        return FAISS(
            embedding_function=embedding,
            index=faiss.IndexFlatL2(dimension),
            docstore=InMemoryDocstore(),
            index_to_docstore_id={},
        )

    @staticmethod
    def create(store_type: Optional[str] = None, **kwargs) -> VectorStore:
        """
        Create a store by type name.

        Args:
            store_type: ``"chroma"`` or ``"faiss"``; defaults to
                ``config.VECTOR_STORE_TYPE``.
            **kwargs: Forwarded to the matching ``create_*`` method.

        Raises:
            ValueError: If ``store_type`` is not supported.
        """
        store_type = store_type or config.VECTOR_STORE_TYPE
        if store_type == "chroma":
            return VectorStoreFactory.create_chroma(**kwargs)
        if store_type == "faiss":
            return VectorStoreFactory.create_faiss(**kwargs)
        raise ValueError(f"不支持的向量数据库: {store_type}. 可选: chroma, faiss")
80
+
81
+
82
+ # ============================================================
83
+ # 向量数据库管理器
84
+ # ============================================================
85
+
86
class VectorStoreManager:
    """
    High-level wrapper around a Chroma or FAISS vector store: loading or
    creating the store, batched ingestion, retrieval and housekeeping.
    """

    def __init__(
        self,
        vector_store: Optional[VectorStore] = None,
        store_type: Optional[str] = None,
        embedding_function: Optional[Embeddings] = None,
        persist_directory: Optional[str | Path] = None,
    ):
        """
        Args:
            vector_store: Existing store to wrap; when omitted one is loaded
                from ``persist_directory`` or created.
            store_type: ``"chroma"`` or ``"faiss"``; defaults to
                ``config.VECTOR_STORE_TYPE``.
            embedding_function: Embedding model; defaults to ``get_embedding_model()``.
            persist_directory: Base directory; defaults to ``config.VECTOR_DB_DIR``.
        """
        self.store_type = store_type or config.VECTOR_STORE_TYPE
        self.embedding_function = embedding_function or get_embedding_model()
        self.persist_directory = str(persist_directory or config.VECTOR_DB_DIR)
        self._store = vector_store or self._init_store()

    def _init_store(self) -> VectorStore:
        """Load or create the store for ``self.store_type``."""
        if self.store_type == "chroma":
            return self._init_chroma()
        if self.store_type == "faiss":
            return self._init_faiss()
        raise ValueError(f"不支持的向量数据库: {self.store_type}")

    def _init_chroma(self) -> Chroma:
        """Open the persisted Chroma database if present, otherwise create one."""
        persist_dir = Path(self.persist_directory) / "chroma"
        if persist_dir.exists() and any(persist_dir.iterdir()):
            logger.info(f"加载已有 Chroma 数据库: {persist_dir}")
            return Chroma(
                persist_directory=str(persist_dir),
                embedding_function=self.embedding_function,
                collection_name=config.CHROMA_COLLECTION_NAME,
            )
        return VectorStoreFactory.create_chroma(
            persist_directory=str(persist_dir),
            embedding_function=self.embedding_function,
        )

    def _init_faiss(self) -> FAISS:
        """Load the persisted FAISS index if present, otherwise create a new store."""
        index_path = Path(self.persist_directory) / "faiss_index"
        # Check for the index file written by save_local (default index name "index"),
        # not just the directory: an empty directory must not trigger a load.
        if (index_path / "index.faiss").exists():
            logger.info(f"加载已有 FAISS 数据库: {index_path}")
            # SECURITY: load_local unpickles the docstore. This is only safe because
            # the index directory is written by this application; never point it at
            # files from an untrusted source.
            return FAISS.load_local(
                str(index_path),
                self.embedding_function,
                allow_dangerous_deserialization=True,
            )
        return VectorStoreFactory.create_faiss(
            embedding_function=self.embedding_function,
        )

    @property
    def store(self) -> VectorStore:
        """The underlying LangChain vector store."""
        return self._store

    # ---- Ingestion ----

    def add_documents(
        self,
        documents: List[Document],
        batch_size: int = 50,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> int:
        """
        Embed and store documents in batches.

        Args:
            documents: Chunks to store.
            batch_size: Number of documents per call to the store (must be > 0).
            progress_callback: Called as ``callback(done, total)`` after each batch.

        Returns:
            Number of documents stored (0 for an empty list).

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if not documents:
            logger.warning("文档列表为空, 跳过入库")
            return 0
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        total = len(documents)
        logger.info(f"开始向量化入库: {total} 个文档块 (批大小={batch_size})")

        for i in range(0, total, batch_size):
            batch = documents[i : i + batch_size]
            self._store.add_documents(batch)
            if progress_callback:
                progress_callback(min(i + batch_size, total), total)

        self._persist()
        logger.info(f"向量化入库完成: {total} 个文档块")
        return total

    def add_texts(
        self,
        texts: List[str],
        metadatas: Optional[List[dict]] = None,
        batch_size: int = 50,
    ) -> List[str]:
        """Embed and store raw texts in batches; returns the ids assigned by the store."""
        if not texts:
            return []
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        all_ids = []
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i : i + batch_size]
            batch_metas = metadatas[i : i + batch_size] if metadatas else None
            all_ids.extend(self._store.add_texts(batch_texts, batch_metas))
        self._persist()
        return all_ids

    # ---- Retrieval ----

    def similarity_search(
        self,
        query: str,
        k: int = config.RETRIEVAL_TOP_K,
        filter: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> List[Document]:
        """
        Return the ``k`` most similar documents.

        A supplied ``filter`` is forwarded to the store regardless of backend,
        so filtered queries are never silently answered with unfiltered results.
        """
        if filter:
            kwargs["filter"] = filter
        return self._store.similarity_search(query, k=k, **kwargs)

    def similarity_search_with_score(
        self,
        query: str,
        k: int = config.RETRIEVAL_TOP_K,
        filter: Optional[Dict[str, Any]] = None,
        score_threshold: float = 0.3,
        **kwargs,
    ) -> List[tuple]:
        """
        Return ``(document, relevance_score)`` pairs whose score is at least
        ``score_threshold``.
        """
        if filter:
            kwargs["filter"] = filter
        raw = self._store.similarity_search_with_relevance_scores(
            query, k=k, **kwargs
        )
        # The default threshold (0.3) is deliberately permissive; raise it for
        # stricter matching.
        return [(doc, score) for doc, score in raw if score >= score_threshold]

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = config.RETRIEVAL_TOP_K,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        filter: Optional[Dict[str, Any]] = None,
    ) -> List[Document]:
        """Maximal-marginal-relevance search: ``k`` results picked from ``fetch_k`` candidates."""
        extra = {"filter": filter} if filter else {}
        return self._store.max_marginal_relevance_search(
            query, k=k, fetch_k=fetch_k, lambda_mult=lambda_mult, **extra
        )

    # ---- Filtered queries ----

    def search_by_document(
        self, query: str, document_name: str, k: int = config.RETRIEVAL_TOP_K
    ) -> List[Document]:
        """Similarity search restricted to chunks whose ``document_name`` matches."""
        return self.similarity_search(query, k=k, filter={"document_name": document_name})

    def search_by_page_range(
        self, query: str, start_page: int, end_page: int,
        k: int = config.RETRIEVAL_TOP_K,
    ) -> List[Document]:
        """Similarity search restricted to ``start_page <= page <= end_page``."""
        # Chroma accepts exactly one operator per field expression, so the range
        # has to be expressed as an explicit $and of two conditions.
        page_filter = {
            "$and": [
                {"page": {"$gte": start_page}},
                {"page": {"$lte": end_page}},
            ]
        }
        return self.similarity_search(query, k=k, filter=page_filter)

    # ---- Management ----

    def _persist(self):
        """Write the FAISS index to disk; nothing is done here for other backends."""
        if self.store_type == "faiss":
            index_path = Path(self.persist_directory) / "faiss_index"
            index_path.mkdir(parents=True, exist_ok=True)
            self._store.save_local(str(index_path))

    def clear(self):
        """Remove all stored vectors and start over with an empty store."""
        if self.store_type == "chroma":
            self._store.delete_collection()
            self._store = VectorStoreFactory.create_chroma(
                persist_directory=Path(self.persist_directory) / "chroma",
                embedding_function=self.embedding_function,
            )
        elif self.store_type == "faiss":
            # Also delete the persisted index; otherwise the "cleared" data would
            # be loaded again the next time a manager is created.
            import shutil

            index_path = Path(self.persist_directory) / "faiss_index"
            shutil.rmtree(index_path, ignore_errors=True)
            self._store = VectorStoreFactory.create_faiss(
                embedding_function=self.embedding_function,
            )
        logger.info("向量数据库已清空")

    def get_document_count(self) -> int:
        """Number of stored vectors; 0 when the count cannot be determined."""
        try:
            if self.store_type == "chroma":
                return self._store._collection.count()
            if self.store_type == "faiss":
                index = getattr(self._store, "index", None)
                return index.ntotal if index is not None else 0
        except Exception as exc:
            # Counting is best effort (it touches backend internals), but a failure
            # should be visible in the logs rather than silently reported as 0.
            logger.warning(f"获取文档数量失败: {exc}")
        return 0

    def get_stats(self) -> Dict[str, Any]:
        """Summary of the store: backend, location, vector count and embedding model name."""
        return {
            "store_type": self.store_type,
            "persist_directory": self.persist_directory,
            "document_count": self.get_document_count(),
            "embedding_model": config.EMBEDDING_MODEL_NAME,
        }
278
+
279
+
280
+ # ============================================================
281
+ # 便捷函数
282
+ # ============================================================
283
+
284
def build_vector_store(
    documents: List[Document],
    store_type: Optional[str] = None,
    embedding_model: Optional[Embeddings] = None,
    clear_existing: bool = False,
) -> VectorStoreManager:
    """
    Create a VectorStoreManager and ingest ``documents`` into it.

    Args:
        documents: Chunks to embed and store.
        store_type: Backend name, forwarded to the manager.
        embedding_model: Embedding model, forwarded to the manager.
        clear_existing: Empty the store before ingesting.

    Returns:
        The manager holding the populated store.
    """
    store_manager = VectorStoreManager(
        embedding_function=embedding_model,
        store_type=store_type,
    )
    if clear_existing:
        store_manager.clear()
    store_manager.add_documents(documents)
    return store_manager
298
+
299
+
300
def load_vector_store(
    store_type: Optional[str] = None,
    embedding_model: Optional[Embeddings] = None,
) -> VectorStoreManager:
    """
    Create a VectorStoreManager for the given backend without ingesting anything.

    The manager loads a persisted store when one exists, otherwise it creates a new one.
    """
    manager_options = {
        "store_type": store_type,
        "embedding_function": embedding_model,
    }
    return VectorStoreManager(**manager_options)
国药准字H37020386_布洛芬片.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22954cb51781e685a42d5dd1abac0bde98906af75a6097871e3a937bdeaa4cdf
3
+ size 125946