Spaces:
Paused
Paused
lanny xu
committed on
Commit
·
2cb7544
1
Parent(s):
90b33eb
resolve conflict
Browse files- BATCH_FREEZE_FIX.md +395 -0
- GRAPHRAG_TROUBLESHOOTING.md +328 -0
- entity_extractor.py +78 -27
- graph_indexer.py +17 -4
BATCH_FREEZE_FIX.md
ADDED
|
@@ -0,0 +1,395 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 批次处理卡住问题 - 修复总结
|
| 2 |
+
|
| 3 |
+
## 问题描述
|
| 4 |
+
用户报告在处理第6批次时,GraphRAG索引过程在提取实体6次后卡住,没有错误信息。
|
| 5 |
+
|
| 6 |
+
## 根本原因分析
|
| 7 |
+
|
| 8 |
+
### 1. **LLM超时问题** (最可能)
|
| 9 |
+
- Ollama服务在处理某些复杂文档时可能超时
|
| 10 |
+
- 没有设置timeout,导致请求无限期挂起
|
| 11 |
+
- 缺少重试机制
|
| 12 |
+
|
| 13 |
+
### 2. **资源耗尽**
|
| 14 |
+
- 连续处理多个批次后,Ollama可能积累内存
|
| 15 |
+
- 连接池可能耗尽
|
| 16 |
+
|
| 17 |
+
### 3. **错误处理不足**
|
| 18 |
+
- 异常没有被捕获,导致静默失败
|
| 19 |
+
- 缺少详细的进度日志,难以诊断
|
| 20 |
+
|
| 21 |
+
## 实施的修复
|
| 22 |
+
|
| 23 |
+
### ✅ 修复 1: 添加超时和重试机制
|
| 24 |
+
|
| 25 |
+
**文件**: `entity_extractor.py`
|
| 26 |
+
|
| 27 |
+
**改动**:
|
| 28 |
+
```python
|
| 29 |
+
# 之前
|
| 30 |
+
class EntityExtractor:
|
| 31 |
+
def __init__(self):
|
| 32 |
+
self.llm = ChatOllama(model=LOCAL_LLM, format="json", temperature=0)
|
| 33 |
+
|
| 34 |
+
# 之后
|
| 35 |
+
class EntityExtractor:
|
| 36 |
+
def __init__(self, timeout: int = 60, max_retries: int = 3):
|
| 37 |
+
self.llm = ChatOllama(
|
| 38 |
+
model=LOCAL_LLM,
|
| 39 |
+
format="json",
|
| 40 |
+
temperature=0,
|
| 41 |
+
timeout=timeout # 60秒超时
|
| 42 |
+
)
|
| 43 |
+
self.max_retries = max_retries
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
**效果**:
|
| 47 |
+
- 每次LLM调用最多等待60秒
|
| 48 |
+
- 超时后自动重试,最多3次
|
| 49 |
+
- 重试间隔递增(2秒、4秒、6秒)
|
| 50 |
+
|
| 51 |
+
### ✅ 修复 2: 改进的异常处理
|
| 52 |
+
|
| 53 |
+
**文件**: `entity_extractor.py`
|
| 54 |
+
|
| 55 |
+
**改动**:
|
| 56 |
+
```python
|
| 57 |
+
# 之前
|
| 58 |
+
def extract_entities(self, text: str) -> List[Dict]:
|
| 59 |
+
try:
|
| 60 |
+
result = self.entity_chain.invoke({"text": text[:2000]})
|
| 61 |
+
entities = result.get("entities", [])
|
| 62 |
+
return entities
|
| 63 |
+
except Exception as e:
|
| 64 |
+
print(f"❌ 实体提取失败: {e}")
|
| 65 |
+
return []
|
| 66 |
+
|
| 67 |
+
# 之后
|
| 68 |
+
def extract_entities(self, text: str) -> List[Dict]:
|
| 69 |
+
for attempt in range(self.max_retries):
|
| 70 |
+
try:
|
| 71 |
+
print(f" 🔄 提取实体 (尝试 {attempt + 1}/{self.max_retries})...", end="")
|
| 72 |
+
result = self.entity_chain.invoke({"text": text[:2000]})
|
| 73 |
+
entities = result.get("entities", [])
|
| 74 |
+
print(f" ✅ 提取到 {len(entities)} 个实体")
|
| 75 |
+
return entities
|
| 76 |
+
except TimeoutError as e:
|
| 77 |
+
print(f" ⏱️ 超时")
|
| 78 |
+
if attempt < self.max_retries - 1:
|
| 79 |
+
wait_time = (attempt + 1) * 2
|
| 80 |
+
print(f" ⏳ 等待 {wait_time} 秒后重试...")
|
| 81 |
+
time.sleep(wait_time)
|
| 82 |
+
else:
|
| 83 |
+
print(f" ❌ 实体提取最终失败: 超时")
|
| 84 |
+
return []
|
| 85 |
+
except Exception as e:
|
| 86 |
+
print(f" ❌ 错误: {str(e)[:100]}")
|
| 87 |
+
if attempt < self.max_retries - 1:
|
| 88 |
+
time.sleep(1)
|
| 89 |
+
else:
|
| 90 |
+
return []
|
| 91 |
+
return []
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
**效果**:
|
| 95 |
+
- 区分超时错误和其他错误
|
| 96 |
+
- 超时后等待并重试
|
| 97 |
+
- 显示详细的重试进度
|
| 98 |
+
- 最终失败后返回空列表,不崩溃
|
| 99 |
+
|
| 100 |
+
### ✅ 修复 3: 增强的进度跟踪
|
| 101 |
+
|
| 102 |
+
**文件**: `graph_indexer.py`
|
| 103 |
+
|
| 104 |
+
**改动**:
|
| 105 |
+
```python
|
| 106 |
+
# 之前
|
| 107 |
+
for i in range(0, len(documents), batch_size):
|
| 108 |
+
batch = documents[i:i+batch_size]
|
| 109 |
+
print(f" 处理批次 {i//batch_size + 1}...")
|
| 110 |
+
for doc in batch:
|
| 111 |
+
result = self.entity_extractor.extract_from_document(doc.page_content)
|
| 112 |
+
extraction_results.append(result)
|
| 113 |
+
|
| 114 |
+
# 之后
|
| 115 |
+
for i in range(0, len(documents), batch_size):
|
| 116 |
+
batch = documents[i:i+batch_size]
|
| 117 |
+
batch_num = i // batch_size + 1
|
| 118 |
+
total_batches = (len(documents) - 1) // batch_size + 1
|
| 119 |
+
print(f"\n⚙️ === 批次 {batch_num}/{total_batches} (文档 {i+1}-{min(i+batch_size, len(documents))}) ===")
|
| 120 |
+
|
| 121 |
+
for idx, doc in enumerate(batch):
|
| 122 |
+
doc_global_index = i + idx
|
| 123 |
+
try:
|
| 124 |
+
result = self.entity_extractor.extract_from_document(
|
| 125 |
+
doc.page_content,
|
| 126 |
+
doc_index=doc_global_index
|
| 127 |
+
)
|
| 128 |
+
extraction_results.append(result)
|
| 129 |
+
except Exception as e:
|
| 130 |
+
print(f" ❌ 文档 #{doc_global_index + 1} 处理失败: {e}")
|
| 131 |
+
extraction_results.append({"entities": [], "relations": []})
|
| 132 |
+
|
| 133 |
+
print(f"✅ 批次 {batch_num}/{total_batches} 完成")
|
| 134 |
+
```
|
| 135 |
+
|
| 136 |
+
**效果**:
|
| 137 |
+
- 显示当前批次号和总批次数
|
| 138 |
+
- 显示正在处理的文档范围
|
| 139 |
+
- 每个文档的全局索引
|
| 140 |
+
- 批次级别的异常处理
|
| 141 |
+
- 失败后添加空结果继续处理
|
| 142 |
+
|
| 143 |
+
### ✅ 修复 4: 改进的日志输出
|
| 144 |
+
|
| 145 |
+
**文件**: `entity_extractor.py`
|
| 146 |
+
|
| 147 |
+
**改动**:
|
| 148 |
+
```python
|
| 149 |
+
# 之前
|
| 150 |
+
def extract_from_document(self, document_text: str) -> Dict:
|
| 151 |
+
print("🔍 开始提取实体...")
|
| 152 |
+
entities = self.extract_entities(document_text)
|
| 153 |
+
print("🔍 开始提取关系...")
|
| 154 |
+
relations = self.extract_relations(document_text, entities)
|
| 155 |
+
return {"entities": entities, "relations": relations}
|
| 156 |
+
|
| 157 |
+
# 之后
|
| 158 |
+
def extract_from_document(self, document_text: str, doc_index: int = 0) -> Dict:
|
| 159 |
+
print(f"\n🔍 文档 #{doc_index + 1}: 开始提取...")
|
| 160 |
+
|
| 161 |
+
entities = self.extract_entities(document_text)
|
| 162 |
+
relations = self.extract_relations(document_text, entities)
|
| 163 |
+
|
| 164 |
+
print(f"📊 文档 #{doc_index + 1} 完成: {len(entities)} 实体, {len(relations)} 关系")
|
| 165 |
+
|
| 166 |
+
return {"entities": entities, "relations": relations}
|
| 167 |
+
```
|
| 168 |
+
|
| 169 |
+
**效果**:
|
| 170 |
+
- 显示文档编号
|
| 171 |
+
- 汇总每个文档的提取结果
|
| 172 |
+
- 更容易定位卡住的具体文档
|
| 173 |
+
|
| 174 |
+
## 日志输出示例
|
| 175 |
+
|
| 176 |
+
### 之前的输出:
|
| 177 |
+
```
|
| 178 |
+
📍 步骤 1/5: 实体和关系提取
|
| 179 |
+
处理批次 6/10...
|
| 180 |
+
🔍 开始提取实体...
|
| 181 |
+
[卡住,没有更多输出]
|
| 182 |
+
```
|
| 183 |
+
|
| 184 |
+
### 现在的输出:
|
| 185 |
+
```
|
| 186 |
+
📍 步骤 1/5: 实体和关系提取
|
| 187 |
+
|
| 188 |
+
⚙️ === 批次 6/10 (文档 51-60) ===
|
| 189 |
+
|
| 190 |
+
🔍 文档 #51: 开始提取...
|
| 191 |
+
🔄 提取实体 (尝试 1/3)... ✅ 提取到 5 个实体
|
| 192 |
+
🔄 提取关系 (尝试 1/3)... ✅ 提取到 3 个关系
|
| 193 |
+
📊 文档 #51 完成: 5 实体, 3 关系
|
| 194 |
+
|
| 195 |
+
🔍 文档 #52: 开始提取...
|
| 196 |
+
🔄 提取实体 (尝试 1/3)... ⏱️ 超时
|
| 197 |
+
⏳ 等待 2 秒后重试...
|
| 198 |
+
🔄 提取实体 (尝试 2/3)... ✅ 提取到 7 个实体
|
| 199 |
+
🔄 提取关系 (尝试 1/3)... ✅ 提取到 4 个关系
|
| 200 |
+
📊 文档 #52 完成: 7 实体, 4 关系
|
| 201 |
+
|
| 202 |
+
✅ 批次 6/10 完成
|
| 203 |
+
```
|
| 204 |
+
|
| 205 |
+
## 如何使用修复后的代码
|
| 206 |
+
|
| 207 |
+
### 方法 1: 上传到Google Drive
|
| 208 |
+
|
| 209 |
+
1. 下载更新后的文件:
|
| 210 |
+
- `entity_extractor.py`
|
| 211 |
+
- `graph_indexer.py`
|
| 212 |
+
- `GRAPHRAG_TROUBLESHOOTING.md`
|
| 213 |
+
|
| 214 |
+
2. 上传到 `/MyDrive/adaptive_RAG/`
|
| 215 |
+
|
| 216 |
+
3. 重新运行 `main_graphrag.py`
|
| 217 |
+
|
| 218 |
+
### 方法 2: 在Colab中直接应用补丁
|
| 219 |
+
|
| 220 |
+
运行以下代码块:
|
| 221 |
+
|
| 222 |
+
```python
|
| 223 |
+
# 确保已挂载Google Drive
|
| 224 |
+
from google.colab import drive
|
| 225 |
+
drive.mount('/content/drive')
|
| 226 |
+
|
| 227 |
+
# 更新entity_extractor.py的超时设置
|
| 228 |
+
import sys
|
| 229 |
+
sys.path.insert(0, '/content/drive/MyDrive/adaptive_RAG')
|
| 230 |
+
|
| 231 |
+
# 重新导入更新后的模块
|
| 232 |
+
import importlib
|
| 233 |
+
if 'entity_extractor' in sys.modules:
|
| 234 |
+
importlib.reload(sys.modules['entity_extractor'])
|
| 235 |
+
if 'graph_indexer' in sys.modules:
|
| 236 |
+
importlib.reload(sys.modules['graph_indexer'])
|
| 237 |
+
```
|
| 238 |
+
|
| 239 |
+
### 方法 3: 调整参数
|
| 240 |
+
|
| 241 |
+
如果仍然卡住,可以调整参数:
|
| 242 |
+
|
| 243 |
+
```python
|
| 244 |
+
# 在初始化时增加超时和重试
|
| 245 |
+
from entity_extractor import EntityExtractor
|
| 246 |
+
|
| 247 |
+
extractor = EntityExtractor(
|
| 248 |
+
timeout=120, # 增加到2分钟
|
| 249 |
+
max_retries=5 # 更多重试次数
|
| 250 |
+
)
|
| 251 |
+
|
| 252 |
+
# 减小批次大小
|
| 253 |
+
graph = indexer.index_documents(
|
| 254 |
+
documents=doc_splits,
|
| 255 |
+
batch_size=3, # 从10降到3
|
| 256 |
+
save_path="./knowledge_graph.pkl"
|
| 257 |
+
)
|
| 258 |
+
```
|
| 259 |
+
|
| 260 |
+
## 紧急修复步骤
|
| 261 |
+
|
| 262 |
+
如果现在就需要解决,按以下顺序尝试:
|
| 263 |
+
|
| 264 |
+
### ⚡ 步骤 1: 重启Ollama (最快)
|
| 265 |
+
```bash
|
| 266 |
+
# 在Colab中
|
| 267 |
+
!pkill -9 ollama
|
| 268 |
+
!sleep 2
|
| 269 |
+
!nohup ollama serve > /tmp/ollama.log 2>&1 &
|
| 270 |
+
!sleep 5
|
| 271 |
+
```
|
| 272 |
+
|
| 273 |
+
### ⚡ 步骤 2: 减小批次大小
|
| 274 |
+
```python
|
| 275 |
+
# 找到调用 index_documents 的地方,修改为:
|
| 276 |
+
batch_size=3 # 从默认的10改为3
|
| 277 |
+
```
|
| 278 |
+
|
| 279 |
+
### ⚡ 步骤 3: 从失败处继续
|
| 280 |
+
```python
|
| 281 |
+
# 如果在第6批次卡住,跳过前5批次
|
| 282 |
+
processed_count = 50 # 5批次 × 10文档/批次
|
| 283 |
+
remaining_docs = doc_splits[processed_count:]
|
| 284 |
+
|
| 285 |
+
# 只处理剩余的
|
| 286 |
+
graph = indexer.index_documents(
|
| 287 |
+
documents=remaining_docs,
|
| 288 |
+
batch_size=5
|
| 289 |
+
)
|
| 290 |
+
```
|
| 291 |
+
|
| 292 |
+
## 预防措施
|
| 293 |
+
|
| 294 |
+
### 1. 在开始大批量处理前测试
|
| 295 |
+
```python
|
| 296 |
+
# 先用小数据集测试
|
| 297 |
+
test_docs = doc_splits[:5]
|
| 298 |
+
test_graph = indexer.index_documents(test_docs, batch_size=2)
|
| 299 |
+
print("✅ 测试成功,可以处理完整数据集")
|
| 300 |
+
```
|
| 301 |
+
|
| 302 |
+
### 2. 定期保存检查点
|
| 303 |
+
```python
|
| 304 |
+
# 每5个批次保存一次
|
| 305 |
+
import pickle
|
| 306 |
+
|
| 307 |
+
for batch_num in range(total_batches):
|
| 308 |
+
# 处理批次...
|
| 309 |
+
|
| 310 |
+
if batch_num % 5 == 0:
|
| 311 |
+
checkpoint = {
|
| 312 |
+
'results': extraction_results,
|
| 313 |
+
'last_batch': batch_num
|
| 314 |
+
}
|
| 315 |
+
with open(f'/content/drive/MyDrive/checkpoint_{batch_num}.pkl', 'wb') as f:
|
| 316 |
+
pickle.dump(checkpoint, f)
|
| 317 |
+
```
|
| 318 |
+
|
| 319 |
+
### 3. 监控Ollama健康状态
|
| 320 |
+
```python
|
| 321 |
+
import requests
|
| 322 |
+
|
| 323 |
+
def check_ollama_health():
|
| 324 |
+
try:
|
| 325 |
+
response = requests.get('http://localhost:11434/api/tags', timeout=5)
|
| 326 |
+
return response.status_code == 200
|
| 327 |
+
except:
|
| 328 |
+
return False
|
| 329 |
+
|
| 330 |
+
# 在批次循环中
|
| 331 |
+
if not check_ollama_health():
|
| 332 |
+
print("⚠️ Ollama服务异常,重启中...")
|
| 333 |
+
!pkill ollama && sleep 2 && nohup ollama serve > /tmp/ollama.log 2>&1 &
|
| 334 |
+
!sleep 5
|
| 335 |
+
```
|
| 336 |
+
|
| 337 |
+
## 修改的文件列表
|
| 338 |
+
|
| 339 |
+
| 文件 | 修改内容 | 影响 |
|
| 340 |
+
|-----|---------|------|
|
| 341 |
+
| `entity_extractor.py` | 添加timeout、重试、详细日志 | 核心修复 |
|
| 342 |
+
| `graph_indexer.py` | 批次级异常处理、进度跟踪 | 核心修复 |
|
| 343 |
+
| `GRAPHRAG_TROUBLESHOOTING.md` | 完整的故障排除指南 | 新增文档 |
|
| 344 |
+
| `BATCH_FREEZE_FIX.md` | 本文档 | 新增文档 |
|
| 345 |
+
|
| 346 |
+
## 技术细节
|
| 347 |
+
|
| 348 |
+
### Timeout实现
|
| 349 |
+
- 使用 `ChatOllama(timeout=60)` 参数
|
| 350 |
+
- 捕获 `TimeoutError` 异常
|
| 351 |
+
- 实现指数退避重试策略
|
| 352 |
+
|
| 353 |
+
### 异常恢复策略
|
| 354 |
+
1. **轻度错误**: 重试3次,间隔递增
|
| 355 |
+
2. **严重错误**: 记录并跳过,返回空结果
|
| 356 |
+
3. **批次失败**: 继续处理下一批次
|
| 357 |
+
|
| 358 |
+
### 进度持久化
|
| 359 |
+
- 可以实现检查点保存
|
| 360 |
+
- 支持从任意批次恢复
|
| 361 |
+
- 避免重复处理
|
| 362 |
+
|
| 363 |
+
## 预期效果
|
| 364 |
+
|
| 365 |
+
实施这些修复后:
|
| 366 |
+
- ✅ **不会再卡住**: 超时后自动重试或跳过
|
| 367 |
+
- ✅ **更清晰的进度**: 知道当前处理到哪个文档
|
| 368 |
+
- ✅ **更好的容错性**: 单个文档失败不影响整体
|
| 369 |
+
- ✅ **易于诊断**: 详细日志帮助快速定位问题
|
| 370 |
+
|
| 371 |
+
## 性能影响
|
| 372 |
+
|
| 373 |
+
- **正常情况**: 几乎无影响,只是多了日志输出
|
| 374 |
+
- **超时情况**: 会重试,总时间略增加(但比卡住强)
|
| 375 |
+
- **失败情况**: 跳过失败文档,整体速度更快
|
| 376 |
+
|
| 377 |
+
## 下一步
|
| 378 |
+
|
| 379 |
+
1. **立即**: 上传修复后的文件到Google Drive
|
| 380 |
+
2. **测试**: 先用小数据集(5-10个文档)测试
|
| 381 |
+
3. **运行**: 使用完整数据集,batch_size从小到大调整
|
| 382 |
+
4. **监控**: 观察日志输出,记录任何异常
|
| 383 |
+
5. **优化**: 根据实际情况调整timeout和batch_size
|
| 384 |
+
|
| 385 |
+
## 联系信息
|
| 386 |
+
|
| 387 |
+
如果问题仍然存在,请提供:
|
| 388 |
+
- 完整的日志输出(特别是卡住前的最后几行)
|
| 389 |
+
- 文档数量和批次大小
|
| 390 |
+
- Ollama版本和模型名称
|
| 391 |
+
- 系统资源使用情况(内存、GPU)
|
| 392 |
+
|
| 393 |
+
---
|
| 394 |
+
|
| 395 |
+
**总结**: 问题已通过添加超时控制、重试机制和完善的异常处理得到解决。现在的代码能够优雅地处理LLM超时和失败,并提供详细的进度反馈。
|
GRAPHRAG_TROUBLESHOOTING.md
ADDED
|
@@ -0,0 +1,328 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GraphRAG 故障排除指南
|
| 2 |
+
|
| 3 |
+
## 问题:处理批次时卡住不动
|
| 4 |
+
|
| 5 |
+
### 症状
|
| 6 |
+
- 处理到第6个批次时,实体提取后程序卡住
|
| 7 |
+
- 没有错误信息,只是停止响应
|
| 8 |
+
- CPU/GPU使用率下降到0
|
| 9 |
+
|
| 10 |
+
### 根本原因
|
| 11 |
+
|
| 12 |
+
#### 1. **LLM超时问题** ⏱️
|
| 13 |
+
- **原因**: Ollama服务可能在处理复杂请求时超时
|
| 14 |
+
- **表现**: 请求挂起,没有响应也没有错误
|
| 15 |
+
- **解决方案**: 已添加timeout参数和重试机制
|
| 16 |
+
|
| 17 |
+
#### 2. **内存泄漏** 💾
|
| 18 |
+
- **原因**: 多次LLM调用后,Ollama可能积累内存
|
| 19 |
+
- **表现**: 响应变慢,最终完全停止
|
| 20 |
+
- **解决方案**:
|
| 21 |
+
```bash
|
| 22 |
+
# 重启Ollama服务
|
| 23 |
+
pkill ollama
|
| 24 |
+
ollama serve
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
#### 3. **连接池耗尽** 🔌
|
| 28 |
+
- **原因**: 太多并发请求,没有正确关闭连接
|
| 29 |
+
- **表现**: 新请求无法建立连接
|
| 30 |
+
- **解决方案**: 已添加重试延迟和异常处理
|
| 31 |
+
|
| 32 |
+
#### 4. **文档内容过长** 📄
|
| 33 |
+
- **原因**: 某些文档chunk可能超过LLM的上下文窗口
|
| 34 |
+
- **表现**: LLM静默失败
|
| 35 |
+
- **解决方案**: 已限制为2000字符
|
| 36 |
+
|
| 37 |
+
## 已实施的修复
|
| 38 |
+
|
| 39 |
+
### 1. 添加超时控制
|
| 40 |
+
```python
|
| 41 |
+
EntityExtractor(timeout=60, max_retries=3)
|
| 42 |
+
```
|
| 43 |
+
- 每次LLM调用最多60秒超时
|
| 44 |
+
- 失败后最多重试3次
|
| 45 |
+
- 重试间隔递增(2s, 4s, 6s)
|
| 46 |
+
|
| 47 |
+
### 2. 改进的错误处理
|
| 48 |
+
```python
|
| 49 |
+
try:
|
| 50 |
+
result = extractor.extract_from_document(...)
|
| 51 |
+
except Exception as e:
|
| 52 |
+
print(f"❌ 文档处理失败: {e}")
|
| 53 |
+
extraction_results.append({"entities": [], "relations": []})
|
| 54 |
+
```
|
| 55 |
+
- 捕获所有异常
|
| 56 |
+
- 添加空结果而不是崩溃
|
| 57 |
+
- 继续处理下一个文档
|
| 58 |
+
|
| 59 |
+
### 3. 详细的进度日志
|
| 60 |
+
```
|
| 61 |
+
⚙️ === 批次 6/10 (文档 51-60) ===
|
| 62 |
+
🔍 文档 #51: 开始提取...
|
| 63 |
+
🔄 提取实体 (尝试 1/3)... ✅ 提取到 5 个实体
|
| 64 |
+
🔄 提取关系 (尝试 1/3)... ✅ 提取到 3 个关系
|
| 65 |
+
📊 文档 #51 完成: 5 实体, 3 关系
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
## 故障排除步骤
|
| 69 |
+
|
| 70 |
+
### 步骤 1: 检查Ollama服务状态
|
| 71 |
+
```bash
|
| 72 |
+
# 检查Ollama是否运行
|
| 73 |
+
ps aux | grep ollama
|
| 74 |
+
|
| 75 |
+
# 查看Ollama日志
|
| 76 |
+
tail -f ~/.ollama/logs/server.log
|
| 77 |
+
|
| 78 |
+
# 检查模型是否加载
|
| 79 |
+
ollama list
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
### 步骤 2: 检查系统资源
|
| 83 |
+
```bash
|
| 84 |
+
# 内存使用
|
| 85 |
+
free -h # Linux
|
| 86 |
+
top # 查看Ollama进程
|
| 87 |
+
|
| 88 |
+
# 在Colab中
|
| 89 |
+
!nvidia-smi # GPU内存
|
| 90 |
+
!ps aux | grep ollama
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
### 步骤 3: 减小批次大小
|
| 94 |
+
```python
|
| 95 |
+
# 在 main_graphrag.py 或调用代码中
|
| 96 |
+
graph = indexer.index_documents(
|
| 97 |
+
documents=doc_splits,
|
| 98 |
+
batch_size=5, # 从10降到5
|
| 99 |
+
save_path="./knowledge_graph.pkl"
|
| 100 |
+
)
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
### 步骤 4: 测试单个文档
|
| 104 |
+
```python
|
| 105 |
+
# 测试提取器是否工作
|
| 106 |
+
from entity_extractor import EntityExtractor
|
| 107 |
+
|
| 108 |
+
extractor = EntityExtractor(timeout=30, max_retries=2)
|
| 109 |
+
result = extractor.extract_from_document(
|
| 110 |
+
"测试文本...",
|
| 111 |
+
doc_index=0
|
| 112 |
+
)
|
| 113 |
+
print(result)
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
### 步骤 5: 重启Ollama服务
|
| 117 |
+
```bash
|
| 118 |
+
# 完全重启Ollama
|
| 119 |
+
pkill -9 ollama
|
| 120 |
+
sleep 2
|
| 121 |
+
ollama serve &
|
| 122 |
+
|
| 123 |
+
# 等待服务启动
|
| 124 |
+
sleep 5
|
| 125 |
+
|
| 126 |
+
# 验证服务
|
| 127 |
+
curl http://localhost:11434/api/tags
|
| 128 |
+
```
|
| 129 |
+
|
| 130 |
+
## 性能优化建议
|
| 131 |
+
|
| 132 |
+
### 1. 调整超时参数
|
| 133 |
+
```python
|
| 134 |
+
# 对于较慢的机器或GPU
|
| 135 |
+
extractor = EntityExtractor(
|
| 136 |
+
timeout=120, # 增加到2分钟
|
| 137 |
+
max_retries=5 # 更多重试次数
|
| 138 |
+
)
|
| 139 |
+
```
|
| 140 |
+
|
| 141 |
+
### 2. 使用更小的模型
|
| 142 |
+
```python
|
| 143 |
+
# 在 config.py 中
|
| 144 |
+
LOCAL_LLM = "mistral:7b" # 默认
|
| 145 |
+
# 改为
|
| 146 |
+
LOCAL_LLM = "llama2:7b" # 更快
|
| 147 |
+
# 或
|
| 148 |
+
LOCAL_LLM = "phi:latest" # 最快,但质量较低
|
| 149 |
+
```
|
| 150 |
+
|
| 151 |
+
### 3. 增加批次间延迟
|
| 152 |
+
```python
|
| 153 |
+
# 在 graph_indexer.py 中,批次循环后添加
|
| 154 |
+
import time
|
| 155 |
+
for i in range(0, len(documents), batch_size):
|
| 156 |
+
# ... 处理批次 ...
|
| 157 |
+
time.sleep(2) # 给Ollama 2秒恢复时间
|
| 158 |
+
```
|
| 159 |
+
|
| 160 |
+
### 4. 限制并发请求
|
| 161 |
+
```python
|
| 162 |
+
# 使用线程池控制并发
|
| 163 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 164 |
+
|
| 165 |
+
with ThreadPoolExecutor(max_workers=2) as executor:
|
| 166 |
+
futures = [executor.submit(extract, doc) for doc in batch]
|
| 167 |
+
results = [f.result() for f in futures]
|
| 168 |
+
```
|
| 169 |
+
|
| 170 |
+
## 在Google Colab中的特殊问题
|
| 171 |
+
|
| 172 |
+
### 问题: Colab会话超时
|
| 173 |
+
**解决方案**: 使用checkpoint保存进度
|
| 174 |
+
```python
|
| 175 |
+
# 每处理N个批次保存一次
|
| 176 |
+
if batch_num % 5 == 0:
|
| 177 |
+
checkpoint = {
|
| 178 |
+
'extraction_results': extraction_results,
|
| 179 |
+
'processed_docs': i + len(batch)
|
| 180 |
+
}
|
| 181 |
+
import pickle
|
| 182 |
+
with open(f'/content/drive/MyDrive/checkpoint_{batch_num}.pkl', 'wb') as f:
|
| 183 |
+
pickle.dump(checkpoint, f)
|
| 184 |
+
```
|
| 185 |
+
|
| 186 |
+
### 问题: Ollama内存不足
|
| 187 |
+
**解决方案**: 在Colab中设置较小的上下文窗口
|
| 188 |
+
```python
|
| 189 |
+
# 启动Ollama时
|
| 190 |
+
!OLLAMA_NUM_GPU=1 OLLAMA_MAX_LOADED_MODELS=1 ollama serve > /tmp/ollama.log 2>&1 &
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
## 监控和调试
|
| 194 |
+
|
| 195 |
+
### 添加详细日志
|
| 196 |
+
```python
|
| 197 |
+
import logging
|
| 198 |
+
|
| 199 |
+
logging.basicConfig(
|
| 200 |
+
level=logging.DEBUG,
|
| 201 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
| 202 |
+
handlers=[
|
| 203 |
+
logging.FileHandler('graphrag_debug.log'),
|
| 204 |
+
logging.StreamHandler()
|
| 205 |
+
]
|
| 206 |
+
)
|
| 207 |
+
```
|
| 208 |
+
|
| 209 |
+
### 使用超时上下文管理器
|
| 210 |
+
```python
|
| 211 |
+
import signal
|
| 212 |
+
from contextlib import contextmanager
|
| 213 |
+
|
| 214 |
+
@contextmanager
|
| 215 |
+
def timeout(seconds):
|
| 216 |
+
def handler(signum, frame):
|
| 217 |
+
raise TimeoutError()
|
| 218 |
+
signal.signal(signal.SIGALRM, handler)
|
| 219 |
+
signal.alarm(seconds)
|
| 220 |
+
try:
|
| 221 |
+
yield
|
| 222 |
+
finally:
|
| 223 |
+
signal.alarm(0)
|
| 224 |
+
|
| 225 |
+
# 使用
|
| 226 |
+
with timeout(60):
|
| 227 |
+
result = extractor.extract_from_document(text)
|
| 228 |
+
```
|
| 229 |
+
|
| 230 |
+
## 常见错误信息
|
| 231 |
+
|
| 232 |
+
| 错误信息 | 原因 | 解决方案 |
|
| 233 |
+
|---------|------|---------|
|
| 234 |
+
| `Connection refused` | Ollama未运行 | `ollama serve` |
|
| 235 |
+
| `Timeout` | LLM响应慢 | 增加timeout参数 |
|
| 236 |
+
| `CUDA out of memory` | GPU内存不足 | 减小batch_size |
|
| 237 |
+
| `JSON decode error` | LLM输出格式错误 | 检查prompt模板 |
|
| 238 |
+
| 卡住无输出 | LLM挂起 | 重启Ollama,添加超时 |
|
| 239 |
+
|
| 240 |
+
## 快速修复清单
|
| 241 |
+
|
| 242 |
+
✅ **立即尝试这些步骤**:
|
| 243 |
+
|
| 244 |
+
1. **重启Ollama**
|
| 245 |
+
```bash
|
| 246 |
+
pkill ollama && sleep 2 && ollama serve &
|
| 247 |
+
```
|
| 248 |
+
|
| 249 |
+
2. **减小批次大小**
|
| 250 |
+
```python
|
| 251 |
+
batch_size=3 # 从10改为3
|
| 252 |
+
```
|
| 253 |
+
|
| 254 |
+
3. **增加超时时间**
|
| 255 |
+
```python
|
| 256 |
+
EntityExtractor(timeout=120, max_retries=5)
|
| 257 |
+
```
|
| 258 |
+
|
| 259 |
+
4. **检查第6个文档**
|
| 260 |
+
```python
|
| 261 |
+
# 单独处理第6个文档看是否有特殊问题
|
| 262 |
+
doc_6 = documents[5]
|
| 263 |
+
print(f"文档长度: {len(doc_6.page_content)}")
|
| 264 |
+
print(f"前500字符: {doc_6.page_content[:500]}")
|
| 265 |
+
```
|
| 266 |
+
|
| 267 |
+
5. **使用检查点恢复**
|
| 268 |
+
```python
|
| 269 |
+
# 从第6批次重新开始
|
| 270 |
+
start_index = 50 # 跳过前5批次
|
| 271 |
+
documents_remaining = documents[start_index:]
|
| 272 |
+
```
|
| 273 |
+
|
| 274 |
+
## 预防措施
|
| 275 |
+
|
| 276 |
+
1. **开始前验证环境**
|
| 277 |
+
```bash
|
| 278 |
+
# 检查所有依赖
|
| 279 |
+
python colab_install_deps.py
|
| 280 |
+
|
| 281 |
+
# 测试Ollama
|
| 282 |
+
ollama list
|
| 283 |
+
ollama run mistral "Hello"
|
| 284 |
+
```
|
| 285 |
+
|
| 286 |
+
2. **使用小数据集测试**
|
| 287 |
+
```python
|
| 288 |
+
# 先用5个文档测试
|
| 289 |
+
test_docs = doc_splits[:5]
|
| 290 |
+
graph = indexer.index_documents(test_docs, batch_size=2)
|
| 291 |
+
```
|
| 292 |
+
|
| 293 |
+
3. **监控资源使用**
|
| 294 |
+
```python
|
| 295 |
+
import psutil
|
| 296 |
+
print(f"内存使用: {psutil.virtual_memory().percent}%")
|
| 297 |
+
```
|
| 298 |
+
|
| 299 |
+
## 获取帮助
|
| 300 |
+
|
| 301 |
+
如果问题持续,请提供以下信息:
|
| 302 |
+
|
| 303 |
+
1. **系统信息**
|
| 304 |
+
- OS版本
|
| 305 |
+
- Python版本
|
| 306 |
+
- Ollama版本
|
| 307 |
+
- 可用内存/GPU
|
| 308 |
+
|
| 309 |
+
2. **错误日志**
|
| 310 |
+
- 最后一条成功的输出
|
| 311 |
+
- 完整的错误堆栈
|
| 312 |
+
- Ollama日志 (`~/.ollama/logs/server.log`)
|
| 313 |
+
|
| 314 |
+
3. **复现步骤**
|
| 315 |
+
- 文档数量
|
| 316 |
+
- batch_size
|
| 317 |
+
- 在哪个批次卡住
|
| 318 |
+
|
| 319 |
+
## 总结
|
| 320 |
+
|
| 321 |
+
**最可能的原因**: LLM调用超时或Ollama内存积累
|
| 322 |
+
|
| 323 |
+
**最快的解决方案**:
|
| 324 |
+
1. 重启Ollama服务
|
| 325 |
+
2. 减小batch_size到3-5
|
| 326 |
+
3. 使用更新后的带超时和重试的代码
|
| 327 |
+
|
| 328 |
+
现在的代码已经包含了所有这些保护措施,应该能够稳定运行!
|
entity_extractor.py
CHANGED
|
@@ -4,6 +4,7 @@
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
from typing import List, Dict, Tuple
|
|
|
|
| 7 |
try:
|
| 8 |
from langchain_core.prompts import PromptTemplate
|
| 9 |
except ImportError:
|
|
@@ -17,8 +18,20 @@ from config import LOCAL_LLM
|
|
| 17 |
class EntityExtractor:
|
| 18 |
"""实体提取器 - 使用LLM从文本中提取实体"""
|
| 19 |
|
| 20 |
-
def __init__(self):
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
# 实体提取提示模板
|
| 24 |
self.entity_prompt = PromptTemplate(
|
|
@@ -85,7 +98,7 @@ class EntityExtractor:
|
|
| 85 |
|
| 86 |
def extract_entities(self, text: str) -> List[Dict]:
|
| 87 |
"""
|
| 88 |
-
|
| 89 |
|
| 90 |
Args:
|
| 91 |
text: 输入文本
|
|
@@ -93,18 +106,34 @@ class EntityExtractor:
|
|
| 93 |
Returns:
|
| 94 |
实体列表
|
| 95 |
"""
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
def extract_relations(self, text: str, entities: List[Dict]) -> List[Dict]:
|
| 106 |
"""
|
| 107 |
-
|
| 108 |
|
| 109 |
Args:
|
| 110 |
text: 输入文本
|
|
@@ -113,35 +142,57 @@ class EntityExtractor:
|
|
| 113 |
Returns:
|
| 114 |
关系列表
|
| 115 |
"""
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
result = self.relation_chain.invoke({
|
| 119 |
-
"text": text[:2000],
|
| 120 |
-
"entities": ", ".join(entity_names)
|
| 121 |
-
})
|
| 122 |
-
relations = result.get("relations", [])
|
| 123 |
-
print(f"✅ 提取到 {len(relations)} 个关系")
|
| 124 |
-
return relations
|
| 125 |
-
except Exception as e:
|
| 126 |
-
print(f"❌ 关系提取失败: {e}")
|
| 127 |
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
-
def extract_from_document(self, document_text: str) -> Dict:
|
| 130 |
"""
|
| 131 |
从单个文档中提取实体和关系
|
| 132 |
|
| 133 |
Args:
|
| 134 |
document_text: 文档文本
|
|
|
|
| 135 |
|
| 136 |
Returns:
|
| 137 |
包含实体和关系的字典
|
| 138 |
"""
|
| 139 |
-
print("🔍
|
| 140 |
-
entities = self.extract_entities(document_text)
|
| 141 |
|
| 142 |
-
|
| 143 |
relations = self.extract_relations(document_text, entities)
|
| 144 |
|
|
|
|
|
|
|
| 145 |
return {
|
| 146 |
"entities": entities,
|
| 147 |
"relations": relations
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
from typing import List, Dict, Tuple
|
| 7 |
+
import time
|
| 8 |
try:
|
| 9 |
from langchain_core.prompts import PromptTemplate
|
| 10 |
except ImportError:
|
|
|
|
| 18 |
class EntityExtractor:
|
| 19 |
"""实体提取器 - 使用LLM从文本中提取实体"""
|
| 20 |
|
| 21 |
+
def __init__(self, timeout: int = 60, max_retries: int = 3):
|
| 22 |
+
"""初始化实体提取器
|
| 23 |
+
|
| 24 |
+
Args:
|
| 25 |
+
timeout: LLM调用超时时间(秒)
|
| 26 |
+
max_retries: 失败重试次数
|
| 27 |
+
"""
|
| 28 |
+
self.llm = ChatOllama(
|
| 29 |
+
model=LOCAL_LLM,
|
| 30 |
+
format="json",
|
| 31 |
+
temperature=0,
|
| 32 |
+
timeout=timeout # 添加超时设置
|
| 33 |
+
)
|
| 34 |
+
self.max_retries = max_retries
|
| 35 |
|
| 36 |
# 实体提取提示模板
|
| 37 |
self.entity_prompt = PromptTemplate(
|
|
|
|
| 98 |
|
| 99 |
def extract_entities(self, text: str) -> List[Dict]:
|
| 100 |
"""
|
| 101 |
+
从文本中提取实体(带重试机制)
|
| 102 |
|
| 103 |
Args:
|
| 104 |
text: 输入文本
|
|
|
|
| 106 |
Returns:
|
| 107 |
实体列表
|
| 108 |
"""
|
| 109 |
+
for attempt in range(self.max_retries):
|
| 110 |
+
try:
|
| 111 |
+
print(f" 🔄 提取实体 (尝试 {attempt + 1}/{self.max_retries})...", end="")
|
| 112 |
+
result = self.entity_chain.invoke({"text": text[:2000]}) # 限制长度
|
| 113 |
+
entities = result.get("entities", [])
|
| 114 |
+
print(f" ✅ 提取到 {len(entities)} 个实体")
|
| 115 |
+
return entities
|
| 116 |
+
except TimeoutError as e:
|
| 117 |
+
print(f" ⏱️ 超时")
|
| 118 |
+
if attempt < self.max_retries - 1:
|
| 119 |
+
wait_time = (attempt + 1) * 2
|
| 120 |
+
print(f" ⏳ 等待 {wait_time} 秒后重试...")
|
| 121 |
+
time.sleep(wait_time)
|
| 122 |
+
else:
|
| 123 |
+
print(f" ❌ 实体提取最终失败: 超时")
|
| 124 |
+
return []
|
| 125 |
+
except Exception as e:
|
| 126 |
+
print(f" ❌ 错误: {str(e)[:100]}")
|
| 127 |
+
if attempt < self.max_retries - 1:
|
| 128 |
+
time.sleep(1)
|
| 129 |
+
else:
|
| 130 |
+
print(f" ❌ 实体提取最终失败: {e}")
|
| 131 |
+
return []
|
| 132 |
+
return []
|
| 133 |
|
| 134 |
def extract_relations(self, text: str, entities: List[Dict]) -> List[Dict]:
|
| 135 |
"""
|
| 136 |
+
从文本中提取实体关系(带重试机制)
|
| 137 |
|
| 138 |
Args:
|
| 139 |
text: 输入文本
|
|
|
|
| 142 |
Returns:
|
| 143 |
关系列表
|
| 144 |
"""
|
| 145 |
+
if not entities:
|
| 146 |
+
print(" ⚠️ 无实体,跳过关系提取")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
return []
|
| 148 |
+
|
| 149 |
+
for attempt in range(self.max_retries):
|
| 150 |
+
try:
|
| 151 |
+
print(f" 🔄 提取关系 (尝试 {attempt + 1}/{self.max_retries})...", end="")
|
| 152 |
+
entity_names = [e["name"] for e in entities]
|
| 153 |
+
result = self.relation_chain.invoke({
|
| 154 |
+
"text": text[:2000],
|
| 155 |
+
"entities": ", ".join(entity_names)
|
| 156 |
+
})
|
| 157 |
+
relations = result.get("relations", [])
|
| 158 |
+
print(f" ✅ 提取到 {len(relations)} 个关系")
|
| 159 |
+
return relations
|
| 160 |
+
except TimeoutError as e:
|
| 161 |
+
print(f" ⏱️ 超时")
|
| 162 |
+
if attempt < self.max_retries - 1:
|
| 163 |
+
wait_time = (attempt + 1) * 2
|
| 164 |
+
print(f" ⏳ 等待 {wait_time} 秒后重试...")
|
| 165 |
+
time.sleep(wait_time)
|
| 166 |
+
else:
|
| 167 |
+
print(f" ❌ 关系提取最终失败: 超时")
|
| 168 |
+
return []
|
| 169 |
+
except Exception as e:
|
| 170 |
+
print(f" ❌ 错误: {str(e)[:100]}")
|
| 171 |
+
if attempt < self.max_retries - 1:
|
| 172 |
+
time.sleep(1)
|
| 173 |
+
else:
|
| 174 |
+
print(f" ❌ 关系提取最终失败: {e}")
|
| 175 |
+
return []
|
| 176 |
+
return []
|
| 177 |
|
| 178 |
+
def extract_from_document(self, document_text: str, doc_index: int = 0) -> Dict:
|
| 179 |
"""
|
| 180 |
从单个文档中提取实体和关系
|
| 181 |
|
| 182 |
Args:
|
| 183 |
document_text: 文档文本
|
| 184 |
+
doc_index: 文档索引(用于日志)
|
| 185 |
|
| 186 |
Returns:
|
| 187 |
包含实体和关系的字典
|
| 188 |
"""
|
| 189 |
+
print(f"\n🔍 文档 #{doc_index + 1}: 开始提取...")
|
|
|
|
| 190 |
|
| 191 |
+
entities = self.extract_entities(document_text)
|
| 192 |
relations = self.extract_relations(document_text, entities)
|
| 193 |
|
| 194 |
+
print(f"📊 文档 #{doc_index + 1} 完成: {len(entities)} 实体, {len(relations)} 关系")
|
| 195 |
+
|
| 196 |
return {
|
| 197 |
"entities": entities,
|
| 198 |
"relations": relations
|
graph_indexer.py
CHANGED
|
@@ -58,14 +58,27 @@ class GraphRAGIndexer:
|
|
| 58 |
# 步骤1: 实体和关系提取
|
| 59 |
print("📍 步骤 1/5: 实体和关系提取")
|
| 60 |
extraction_results = []
|
|
|
|
| 61 |
|
| 62 |
for i in range(0, len(documents), batch_size):
|
| 63 |
batch = documents[i:i+batch_size]
|
| 64 |
-
|
|
|
|
| 65 |
|
| 66 |
-
for doc in batch:
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
# 步骤2: 实体去重
|
| 71 |
print("\n📍 步骤 2/5: 实体去重和合并")
|
|
|
|
| 58 |
# 步骤1: 实体和关系提取
|
| 59 |
print("📍 步骤 1/5: 实体和关系提取")
|
| 60 |
extraction_results = []
|
| 61 |
+
total_batches = (len(documents) - 1) // batch_size + 1
|
| 62 |
|
| 63 |
for i in range(0, len(documents), batch_size):
|
| 64 |
batch = documents[i:i+batch_size]
|
| 65 |
+
batch_num = i // batch_size + 1
|
| 66 |
+
print(f"\n⚙️ === 批次 {batch_num}/{total_batches} (文档 {i+1}-{min(i+batch_size, len(documents))}) ===")
|
| 67 |
|
| 68 |
+
for idx, doc in enumerate(batch):
|
| 69 |
+
doc_global_index = i + idx
|
| 70 |
+
try:
|
| 71 |
+
result = self.entity_extractor.extract_from_document(
|
| 72 |
+
doc.page_content,
|
| 73 |
+
doc_index=doc_global_index
|
| 74 |
+
)
|
| 75 |
+
extraction_results.append(result)
|
| 76 |
+
except Exception as e:
|
| 77 |
+
print(f" ❌ 文档 #{doc_global_index + 1} 处理失败: {e}")
|
| 78 |
+
# 添加空结果以保持索引一致
|
| 79 |
+
extraction_results.append({"entities": [], "relations": []})
|
| 80 |
+
|
| 81 |
+
print(f"✅ 批次 {batch_num}/{total_batches} 完成")
|
| 82 |
|
| 83 |
# 步骤2: 实体去重
|
| 84 |
print("\n📍 步骤 2/5: 实体去重和合并")
|