Spaces: Paused
lanny xu committed · Commit c844813
1 Parent(s): 63ec70e
resolve conflict

Files changed:
- KAGGLE_OPTIMIZATION_GUIDE.md +367 -0
- KAGGLE_QUICK_START.py +197 -0
- colab_setup_and_run.py +0 -375
- config.py +7 -1
- setup_and_run.py +112 -0
KAGGLE_OPTIMIZATION_GUIDE.md ADDED
@@ -0,0 +1,367 @@
# Kaggle Environment Optimization Guide - Avoiding Repeated Model Downloads

## 🚨 The Problem

After every Kaggle session restart, the Ollama models have to be downloaded again; the Mistral model is about 4GB, which is very time-consuming.

## 💡 Solutions

### Approach 1: Use a Smaller Model (Recommended ⭐⭐⭐⭐⭐)

**Best choice**: no code changes needed; just pick a smaller version when pulling the model.

#### Model Comparison

| Model | Size | Download Time | Quality | Recommended For |
|-----|------|---------|------|---------|
| `mistral` | ~4GB | 5-10 min | ⭐⭐⭐⭐⭐ | Local development |
| `phi` | ~1.6GB | 2-3 min | ⭐⭐⭐⭐ | **Recommended for Kaggle** |
| `tinyllama` | ~600MB | 1 min | ⭐⭐⭐ | Quick testing |
| `qwen:0.5b` | ~350MB | 30 sec | ⭐⭐ | Ultra-fast testing |

#### How to Use

**Option A**: edit `config.py`
```python
# In /kaggle/working/adaptive_RAG/config.py
LOCAL_LLM = "phi"  # 👈 change to phi or tinyllama
```

**Option B**: override at runtime (no code changes; see the sketch after this block for the config-side support it assumes)
```python
# In a Kaggle Notebook
import os
os.environ['LOCAL_LLM_OVERRIDE'] = 'phi'

# Then import as usual
from config import LOCAL_LLM
# LOCAL_LLM will automatically use 'phi'
```
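
Option B only works if `config.py` actually consults that environment variable, and the `config.py` change in this commit does not show such logic. A minimal sketch, assuming `LOCAL_LLM_OVERRIDE` as the variable name, of what the config side could look like:

```python
# config.py (sketch, not part of this commit): honor an optional runtime override.
import os

# Fall back to the standard model when no override is set.
LOCAL_LLM = os.environ.get("LOCAL_LLM_OVERRIDE", "mistral")
```

With this in place, setting the variable before `from config import LOCAL_LLM` is all a notebook needs to do.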

**Option C**: specify the model directly when pulling
```python
# Pull a smaller model
!ollama pull phi  # instead of mistral

# or
!ollama pull tinyllama
```

---

### Approach 2: Persist Models to a Kaggle Dataset (Moderately Recommended ⭐⭐⭐)

Save the downloaded models as a Dataset and load them directly in the next session.

#### Steps

**Session 1 (first time):**
```python
import subprocess
import shutil
import os

# 1. Download the model
subprocess.run(['ollama', 'pull', 'phi'])

# 2. Locate the model store
# Ollama models usually live in ~/.ollama/models
ollama_models = os.path.expanduser('~/.ollama/models')

# 3. Copy into the working directory (which gets saved as output)
if os.path.exists(ollama_models):
    shutil.copytree(
        ollama_models,
        '/kaggle/working/ollama_models',
        dirs_exist_ok=True
    )
    print("✅ Models copied to /kaggle/working/ollama_models")
    print("📌 When the session ends, save this directory as a Dataset")

# 4. At session end: Save Version → Save as Dataset
#    Name it: ollama-models-cache
```

**Session 2 (subsequent):**
```python
import shutil
import os

# 1. Restore the models from the Dataset
models_cache = '/kaggle/input/ollama-models-cache'

if os.path.exists(models_cache):
    print("📥 Restoring Ollama models...")

    # Create the Ollama model directory
    ollama_dir = os.path.expanduser('~/.ollama/models')
    os.makedirs(ollama_dir, exist_ok=True)

    # Copy the model files
    shutil.copytree(
        models_cache,
        ollama_dir,
        dirs_exist_ok=True
    )

    print("✅ Models restored, no re-download needed!")
else:
    print("⚠️ Cache not found, a fresh download is required")
```

**Note**: this approach has limitations; Ollama's on-disk model layout is complex, and the copied store may not be fully compatible.
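
One way to sidestep the compatibility question is to skip the copy back into `~/.ollama` and instead point Ollama at the restored Dataset through the `OLLAMA_MODELS` environment variable. A sketch, assuming the Dataset preserved the original `manifests/` and `blobs/` layout; note that `/kaggle/input` is read-only, which is fine for serving existing models but not for pulling new ones:

```python
# Sketch: serve models straight from the restored Dataset.
import os
import subprocess

# OLLAMA_MODELS tells the Ollama server where its model store lives.
env = os.environ.copy()
env['OLLAMA_MODELS'] = '/kaggle/input/ollama-models-cache'

subprocess.Popen(['ollama', 'serve'],
                 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                 env=env)
```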

---

### Approach 3: Use a Cloud LLM API (Advanced ⭐⭐⭐⭐)

Avoid local models entirely by using a cloud API.

#### API Options

1. **OpenAI API** (paid)
2. **Anthropic Claude API** (paid)
3. **Hugging Face Inference API** (free, rate-limited)
4. **Together AI** (free tier)

#### Example Code Change

Modify `entity_extractor.py`:

```python
# Original code
from langchain_community.chat_models import ChatOllama
self.llm = ChatOllama(model=LOCAL_LLM, format="json", temperature=0)

# Switch to the OpenAI API
from langchain_openai import ChatOpenAI
self.llm = ChatOpenAI(
    model="gpt-3.5-turbo",  # or gpt-4
    temperature=0,
    openai_api_key=os.getenv("OPENAI_API_KEY")
)

# Or use Hugging Face
from langchain_community.llms import HuggingFaceHub
self.llm = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-Instruct-v0.1",
    huggingfacehub_api_token=os.getenv("HUGGINGFACE_API_TOKEN")
)
```

**Pros**:
- ✅ No model download
- ✅ Fast (cloud GPUs)
- ✅ High quality (GPT-4-class models)

**Cons**:
- ❌ Requires an API key
- ❌ May incur costs
- ❌ Depends on network access

---

### Approach 4: Prebuilt Docker Image (Technical ⭐⭐)

Build a Docker image that already contains the models.

**Steps**:
1. Build a Docker image with Ollama + the models locally
2. Push it to Docker Hub
3. Pull that image in Kaggle

**Limitation**: Kaggle's Docker support is limited.

---

## 🎯 Best-Practice Recommendations

### Recommended Combinations

**Fast development/testing**:
```python
# Use the phi model (balances speed and quality)
LOCAL_LLM = "phi"
```

**Production**:
```python
# Use a cloud API (fast, high quality)
# Set OPENAI_API_KEY in Kaggle Secrets
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-3.5-turbo")
```
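
How the key travels from Kaggle Secrets into the environment is left implicit above. On Kaggle this is usually done with the `kaggle_secrets` client; a short sketch, assuming the secret was stored under the label `OPENAI_API_KEY` in the notebook's Add-ons → Secrets panel:

```python
# Sketch: load an API key from Kaggle Secrets into the environment.
import os
from kaggle_secrets import UserSecretsClient

# The label must match the name given to the secret in Kaggle.
os.environ["OPENAI_API_KEY"] = UserSecretsClient().get_secret("OPENAI_API_KEY")
```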

**Fully offline**:
```python
# Use tinyllama (fastest download)
LOCAL_LLM = "tinyllama"
```

---

## 📋 Complete Kaggle Workflow (Optimized)

### Cell 1: Initialization
```python
import os, subprocess, sys

os.chdir('/kaggle/working')
if not os.path.exists('adaptive_RAG'):
    subprocess.run(['git', 'clone', 'https://github.com/LannyCodes/adaptive_RAG.git'])

os.chdir('adaptive_RAG')

# Switch the config to the smaller model
with open('config.py', 'r') as f:
    content = f.read()

content = content.replace('LOCAL_LLM = "mistral"', 'LOCAL_LLM = "phi"')

with open('config.py', 'w') as f:
    f.write(content)

print("✅ Switched to the phi model")

sys.path.insert(0, '/kaggle/working/adaptive_RAG')
```

### Cell 2: Install Ollama
```python
import time

# Install Ollama
subprocess.run('curl -fsSL https://ollama.com/install.sh | sh', shell=True)

# Start the service
subprocess.Popen(['ollama', 'serve'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
time.sleep(15)
```

### Cell 3: Download the Optimized Model
```python
# Pull the smaller model
print("📥 Downloading the phi model (~1.6GB, 2-3 minutes)...")
subprocess.run(['ollama', 'pull', 'phi'])

print("✅ Model download complete")
```

### Cell 4: Install Dependencies and Run
```python
!pip install -r requirements_graphrag.txt -q

# Continue with your processing...
```
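
The "continue with your processing" placeholder corresponds to the indexing calls that `KAGGLE_QUICK_START.py` prints as its suggested next steps; spelled out here with that script's `batch_size=3`:

```python
# Build the knowledge base and the GraphRAG index.
from document_processor import DocumentProcessor
from graph_indexer import GraphRAGIndexer

doc_processor = DocumentProcessor()
vectorstore, retriever, doc_splits = doc_processor.setup_knowledge_base(enable_graphrag=True)

indexer = GraphRAGIndexer()
graph = indexer.index_documents(doc_splits, batch_size=3)
```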

---

## 🔢 Time Comparison

| Scenario | Mistral | Phi | TinyLlama | Cloud API |
|-----|---------|-----|-----------|---------|
| **First download** | 5-10 min | 2-3 min | 1 min | 0 min |
| **Subsequent sessions** | 5-10 min | 2-3 min | 1 min | 0 min |
| **Weekly total**<br>(5 sessions) | 25-50 min | 10-15 min | 5 min | 0 min |

---

## 💰 Cost Comparison

| Approach | Time Cost | Money Cost | Quality |
|-----|---------|---------|------|
| Mistral | High ❌ | Free ✅ | High ✅ |
| Phi | Medium ✅ | Free ✅ | Medium-high ✅ |
| TinyLlama | Low ✅ | Free ✅ | Medium ⚠️ |
| GPT-3.5 API | Very low ✅ | ~$0.5-2/day ⚠️ | Very high ✅ |

---

## 🎁 Quick Setup Script

Save the following as `KAGGLE_QUICK_START.py`:

```python
"""
Kaggle quick-start script - applies the optimized configuration automatically
"""

import os
import subprocess
import sys
import time

print("🚀 Kaggle Quick Start (optimized)")
print("="*60)

# 1. Clone the project
os.chdir('/kaggle/working')
if not os.path.exists('adaptive_RAG'):
    subprocess.run(['git', 'clone', 'https://github.com/LannyCodes/adaptive_RAG.git'])

os.chdir('adaptive_RAG')

# 2. Choose the model automatically (based on this flag)
USE_SMALL_MODEL = True  # 👈 set to False to use Mistral

if USE_SMALL_MODEL:
    MODEL_NAME = "phi"
    print("✅ Using the optimized model: phi (1.6GB)")
else:
    MODEL_NAME = "mistral"
    print("✅ Using the standard model: mistral (4GB)")

# Update the config
with open('config.py', 'r') as f:
    content = f.read()

content = content.replace(
    'LOCAL_LLM = "mistral"',
    f'LOCAL_LLM = "{MODEL_NAME}"'
)

with open('config.py', 'w') as f:
    f.write(content)

# 3. Install Ollama
check = subprocess.run(['which', 'ollama'], capture_output=True)
if check.returncode != 0:
    print("📥 Installing Ollama...")
    subprocess.run('curl -fsSL https://ollama.com/install.sh | sh', shell=True)

# 4. Start the service
subprocess.Popen(['ollama', 'serve'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
time.sleep(15)

# 5. Pull the model
print(f"📦 Downloading the {MODEL_NAME} model...")
subprocess.run(['ollama', 'pull', MODEL_NAME])

# 6. Install dependencies
print("📦 Installing dependencies...")
subprocess.run([sys.executable, '-m', 'pip', 'install', '-r', 'requirements_graphrag.txt', '-q'])

sys.path.insert(0, '/kaggle/working/adaptive_RAG')

print("\n" + "="*60)
print("✅ Environment ready!")
print("="*60)
print(f"\n📌 Model in use: {MODEL_NAME}")
print("📌 You can now run the GraphRAG indexing")
```

---

## Summary

**Most recommended solutions**:

1. ⭐⭐⭐⭐⭐ **Use the Phi model** - balances speed and quality
2. ⭐⭐⭐⭐ **Use a cloud API** - best for production
3. ⭐⭐⭐ **Use TinyLlama** - for quick tests

**In practice**:
- Just change `LOCAL_LLM = "mistral"` to `LOCAL_LLM = "phi"` in `config.py`
- Or patch it automatically at runtime in Kaggle (see the quick-start script)

Each session then takes only 2-3 minutes of model download instead of 5-10!
KAGGLE_QUICK_START.py ADDED
@@ -0,0 +1,197 @@
"""
Kaggle quick-start script - avoids re-downloading large models
Uses an optimized small-model configuration to cut startup time dramatically

Usage:
    Run in the first cell of a Kaggle Notebook:
    exec(open('/kaggle/working/adaptive_RAG/KAGGLE_QUICK_START.py').read())
"""

import os
import subprocess
import sys
import time

import requests  # used to verify the Ollama server below

print("🚀 Kaggle Quick Start (optimized)")
print("="*70)

# ==================== Configuration ====================
REPO_URL = "https://github.com/LannyCodes/adaptive_RAG.git"
PROJECT_DIR = "/kaggle/working/adaptive_RAG"

# Model choice (adjust as needed)
# "phi"       - 1.6GB, 2-3 min download, good quality ⭐⭐⭐⭐ (recommended)
# "tinyllama" - 600MB, 1 min download, medium quality ⭐⭐⭐
# "qwen:0.5b" - 350MB, 30 sec download, lower quality ⭐⭐
# "mistral"   - 4GB, 5-10 min download, best quality ⭐⭐⭐⭐⭐ (slow)

PREFERRED_MODEL = "phi"  # 👈 change this to pick a model

print(f"\n📌 Configuration:")
print(f"   • Repository: {REPO_URL}")
print(f"   • Model: {PREFERRED_MODEL}")
print()

# ==================== Step 1: Clone the project ====================
print("📦 Step 1/6: Cloning the project...")

os.chdir('/kaggle/working')

if os.path.exists(PROJECT_DIR):
    print("   ✅ Project already present")
else:
    result = subprocess.run(['git', 'clone', REPO_URL], capture_output=True, text=True)
    if result.returncode == 0:
        print("   ✅ Project cloned")
    else:
        print(f"   ❌ Clone failed: {result.stderr}")
        sys.exit(1)

os.chdir(PROJECT_DIR)

# ==================== Step 2: Switch the config to the small model ====================
print("\n⚙️ Step 2/6: Optimizing the model configuration...")

config_file = 'config.py'

with open(config_file, 'r', encoding='utf-8') as f:
    content = f.read()

# Swap in the chosen model
if 'LOCAL_LLM = "mistral"' in content:
    content = content.replace(
        'LOCAL_LLM = "mistral"',
        f'LOCAL_LLM = "{PREFERRED_MODEL}"  # Kaggle optimization: use a smaller model'
    )

    with open(config_file, 'w', encoding='utf-8') as f:
        f.write(content)

    print(f"   ✅ Switched to the {PREFERRED_MODEL} model")
else:
    print(f"   ℹ️ Config is already optimized")

# ==================== Step 3: Check and install Ollama ====================
print("\n🔧 Step 3/6: Checking Ollama...")

ollama_check = subprocess.run(['which', 'ollama'], capture_output=True)

if ollama_check.returncode == 0:
    print("   ✅ Ollama already installed")
else:
    print("   📥 Installing Ollama...")
    subprocess.run('curl -fsSL https://ollama.com/install.sh | sh', shell=True)
    time.sleep(3)
    print("   ✅ Ollama installed")

# Verify the installation
version_result = subprocess.run(['ollama', '--version'], capture_output=True, text=True)
if version_result.returncode == 0:
    print(f"   📌 {version_result.stdout.strip()}")

# ==================== Step 4: Start the Ollama service ====================
print("\n🚀 Step 4/6: Starting the Ollama service...")

# Check whether it is already running
ps_check = subprocess.run(['pgrep', '-f', 'ollama serve'], capture_output=True)

if ps_check.returncode == 0:
    print("   ✅ Ollama service already running")
else:
    print("   🔄 Starting the service...")
    subprocess.Popen(['ollama', 'serve'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    time.sleep(15)

# Verify
try:
    response = requests.get('http://localhost:11434/api/tags', timeout=10)
    if response.status_code == 200:
        print("   ✅ Service is up")
except Exception:
    print("   ⚠️ Service check failed, but it may still be starting...")

# ==================== Step 5: Download the chosen model ====================
print(f"\n📦 Step 5/6: Downloading the {PREFERRED_MODEL} model...")

# Check whether the model is already present
list_result = subprocess.run(['ollama', 'list'], capture_output=True, text=True)

if PREFERRED_MODEL in list_result.stdout:
    print(f"   ✅ {PREFERRED_MODEL} model already present")
else:
    # Show an estimated download time
    time_estimates = {
        "qwen:0.5b": "about 30 seconds",
        "tinyllama": "about 1 minute",
        "phi": "about 2-3 minutes",
        "mistral": "about 5-10 minutes"
    }

    estimated_time = time_estimates.get(PREFERRED_MODEL, "unknown")

    print(f"   📥 Starting download (estimated time: {estimated_time})...")
    print(f"   ⏳ Please wait...")

    start_time = time.time()

    pull_result = subprocess.run(
        ['ollama', 'pull', PREFERRED_MODEL],
        capture_output=True,
        text=True
    )

    elapsed = time.time() - start_time

    if pull_result.returncode == 0:
        print(f"   ✅ Model downloaded ({int(elapsed)}s)")
    else:
        print(f"   ⚠️ Download warning: {pull_result.stderr[:200]}")

# ==================== Step 6: Install Python dependencies ====================
print("\n📦 Step 6/6: Installing Python dependencies...")

subprocess.run([sys.executable, '-m', 'pip', 'install', '-r', 'requirements_graphrag.txt', '-q'])
subprocess.run([sys.executable, '-m', 'pip', 'install', '-U',
                'langchain', 'langchain-core', 'langchain-community',
                'langchain-text-splitters', '-q'])

print("   ✅ Dependencies installed")

# ==================== Set the Python path ====================
if PROJECT_DIR not in sys.path:
    sys.path.insert(0, PROJECT_DIR)

# ==================== Done ====================
print("\n" + "="*70)
print("✅ Environment ready!")
print("="*70)

print(f"\n📊 Configuration summary:")
print(f"   • Working directory: {os.getcwd()}")
print(f"   • Model in use: {PREFERRED_MODEL}")
print(f"   • Python path: added")

# Model comparison notes
print(f"\n📌 Model selection notes:")
print("   • phi (current) - balances speed and quality, recommended for daily use")
print("   • tinyllama     - fastest download, good for quick tests")
print("   • mistral       - best quality but slow to download (not recommended on Kaggle)")

print(f"\n💡 Next steps:")
print("   1. Start the GraphRAG indexing:")
print("      from document_processor import DocumentProcessor")
print("      from graph_indexer import GraphRAGIndexer")
print("      ")
print("      doc_processor = DocumentProcessor()")
print("      vectorstore, retriever, doc_splits = doc_processor.setup_knowledge_base(enable_graphrag=True)")
print("      ")
print("      indexer = GraphRAGIndexer()")
print("      graph = indexer.index_documents(doc_splits, batch_size=3)")
print()
print("   2. To switch models, edit PREFERRED_MODEL at the top of this script")

print("\n⚠️ Notes:")
print(f"   • The current {PREFERRED_MODEL} model downloads roughly {2 if PREFERRED_MODEL == 'phi' else 5}x faster than Mistral")
print("   • Models still need re-downloading after the session ends (just much faster now)")
print("   • For best quality, use Mistral for local development")
colab_setup_and_run.py DELETED
@@ -1,375 +0,0 @@
#!/usr/bin/env python3
"""
Full GraphRAG runner for the Google Colab environment
Handles starting the Ollama service and running GraphRAG

Usage:
    1. Enable a GPU in Colab
    2. Copy this file into Colab
    3. Run: !python colab_setup_and_run.py
"""

import os
import sys
import time
import subprocess
import signal
from pathlib import Path

print("="*70)
print("🚀 GraphRAG Colab automated deployment script")
print("="*70)

# ============================================================
# 1️⃣ Detect the Colab environment
# ============================================================
def check_colab_environment():
    """Check whether we are running inside Colab"""
    try:
        import google.colab
        print("\n✅ Environment: Google Colab")
        return True
    except ImportError:
        print("\n⚠️ Warning: Colab environment not detected")
        print("   This script is tuned for Colab and may need tweaks elsewhere")
        return False

# ============================================================
# 2️⃣ Install Ollama
# ============================================================
def install_ollama():
    """Install Ollama in Colab"""
    print("\n" + "="*70)
    print("📦 Step 1: Install Ollama")
    print("="*70)

    # Skip if already installed
    if os.path.exists("/usr/local/bin/ollama"):
        print("✅ Ollama already installed")
        return True

    print("\n📥 Downloading and installing Ollama...")
    try:
        # Fetch the Ollama install script
        subprocess.run(
            ["curl", "-fsSL", "https://ollama.com/install.sh", "-o", "/tmp/install_ollama.sh"],
            check=True,
            capture_output=True
        )

        # Run the installer
        subprocess.run(
            ["sh", "/tmp/install_ollama.sh"],
            check=True,
            capture_output=True
        )

        print("✅ Ollama installed")
        return True

    except subprocess.CalledProcessError as e:
        print(f"❌ Ollama installation failed: {e}")
        return False

# ============================================================
# 3️⃣ Start the Ollama service in the background
# ============================================================
def start_ollama_service():
    """Start the Ollama service in the background"""
    print("\n" + "="*70)
    print("🔧 Step 2: Start the Ollama service")
    print("="*70)

    print("\n🔄 Starting the Ollama service in the background...")

    # Approach 1: run in the background via subprocess
    try:
        # Launch the Ollama server (background)
        ollama_process = subprocess.Popen(
            ["ollama", "serve"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            preexec_fn=os.setpgrp  # create a new process group
        )

        # Wait for the service to come up
        print("⏳ Waiting for the Ollama service to start...")
        time.sleep(5)

        # Check whether the service is responding
        try:
            result = subprocess.run(
                ["curl", "-s", "http://localhost:11434/api/tags"],
                capture_output=True,
                timeout=3
            )

            if result.returncode == 0:
                print("✅ Ollama service started (PID: {})".format(ollama_process.pid))

                # Record the PID for later management
                with open("/tmp/ollama.pid", "w") as f:
                    f.write(str(ollama_process.pid))

                return ollama_process
            else:
                print("⚠️ The service may not be healthy, continuing anyway...")

        except subprocess.TimeoutExpired:
            print("⚠️ Service check timed out, but the process is running")
            return ollama_process

    except Exception as e:
        print(f"❌ Failed to start Ollama: {e}")
        return None

# ============================================================
# 4️⃣ Pull the Mistral model
# ============================================================
def pull_mistral_model():
    """Download the Mistral model"""
    print("\n" + "="*70)
    print("📥 Step 3: Download the Mistral model")
    print("="*70)

    print("\n🔄 Pulling the mistral model (this can take several minutes)...")

    try:
        # Skip if the model already exists
        result = subprocess.run(
            ["ollama", "list"],
            capture_output=True,
            text=True,
            timeout=10
        )

        if "mistral" in result.stdout:
            print("✅ Mistral model already present")
            return True

        # Pull the model
        print("📥 Starting the Mistral download...")
        process = subprocess.Popen(
            ["ollama", "pull", "mistral"],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True
        )

        # Stream the download progress
        for line in process.stdout:
            print(f"   {line.strip()}")

        process.wait()

        if process.returncode == 0:
            print("✅ Mistral model downloaded")
            return True
        else:
            print("❌ Model download failed")
            return False

    except Exception as e:
        print(f"❌ Failed to download the Mistral model: {e}")
        return False

# ============================================================
# 5️⃣ Install Python dependencies
# ============================================================
def install_python_dependencies():
    """Install the Python packages GraphRAG needs"""
    print("\n" + "="*70)
    print("📦 Step 4: Install Python dependencies")
    print("="*70)

    packages = [
        "langchain",
        "langchain-community",
        "langchain-core",
        "langgraph",
        "langchain-ollama",
        "chromadb",
        "sentence-transformers",
        "tiktoken",
        "beautifulsoup4",
        "requests",
        "tavily-python",
        "python-dotenv",
        "networkx",
        "python-louvain",
        "torch",
        "transformers"
    ]

    print("\n📥 Installing the required Python packages...")
    for package in packages:
        try:
            # Note: the pip name does not always match the import name
            # (e.g. beautifulsoup4 → bs4), so this check can re-install
            # some already-present packages; pip will no-op in that case.
            __import__(package.replace("-", "_"))
            print(f"✅ {package} already installed")
        except ImportError:
            print(f"📥 Installing {package}...")
            subprocess.run(
                [sys.executable, "-m", "pip", "install", "-q", package],
                check=True
            )

    print("\n✅ All dependencies installed")

# ============================================================
# 6️⃣ Configure environment variables
# ============================================================
def setup_environment():
    """Configure environment variables"""
    print("\n" + "="*70)
    print("🔑 Step 5: Configure environment variables")
    print("="*70)

    # Check for a .env file
    if os.path.exists(".env"):
        print("\n✅ Found a .env file, loading it...")
        from dotenv import load_dotenv
        load_dotenv()
    else:
        print("\n⚠️ No .env file found")

    # Prompt for the API key interactively
    if "TAVILY_API_KEY" not in os.environ:
        from getpass import getpass
        api_key = getpass("Enter TAVILY_API_KEY (or press Enter to skip): ")
        if api_key:
            os.environ["TAVILY_API_KEY"] = api_key
            print("✅ TAVILY_API_KEY set")
        else:
            print("⚠️ Skipped TAVILY_API_KEY (web search will be unavailable)")

    print("\n📋 Current environment variables:")
    print(f"   TAVILY_API_KEY: {'set' if os.environ.get('TAVILY_API_KEY') else 'not set'}")

# ============================================================
# 7️⃣ Run GraphRAG
# ============================================================
def run_graphrag():
    """Run the GraphRAG main program"""
    print("\n" + "="*70)
    print("🚀 Step 6: Run GraphRAG")
    print("="*70)

    # Make sure main_graphrag.py exists
    if not os.path.exists("main_graphrag.py"):
        print("\n❌ main_graphrag.py not found")
        print("   Make sure the project files are uploaded to Colab")
        return False

    print("\n🔄 Starting the GraphRAG index build...\n")

    try:
        # Run GraphRAG
        result = subprocess.run(
            [sys.executable, "main_graphrag.py"],
            capture_output=False,  # stream output live
            text=True
        )

        if result.returncode == 0:
            print("\n✅ GraphRAG run succeeded!")
            return True
        else:
            print(f"\n❌ GraphRAG run failed (exit code: {result.returncode})")
            return False

    except KeyboardInterrupt:
        print("\n⚠️ Interrupted by the user")
        return False
    except Exception as e:
        print(f"\n❌ Error while running GraphRAG: {e}")
        return False

# ============================================================
# 8️⃣ Cleanup
# ============================================================
def cleanup():
    """Stop background processes"""
    print("\n" + "="*70)
    print("🧹 Cleaning up background processes")
    print("="*70)

    # Stop the Ollama service
    if os.path.exists("/tmp/ollama.pid"):
        try:
            with open("/tmp/ollama.pid", "r") as f:
                pid = int(f.read().strip())

            os.kill(pid, signal.SIGTERM)
            print(f"✅ Ollama service stopped (PID: {pid})")
            os.remove("/tmp/ollama.pid")

        except Exception as e:
            print(f"⚠️ Failed to stop the Ollama service: {e}")

# ============================================================
# Main
# ============================================================
def main():
    """Main execution flow"""
    ollama_process = None

    try:
        # 1. Detect the environment
        is_colab = check_colab_environment()

        # 2. Install Ollama
        if not install_ollama():
            print("\n❌ Ollama installation failed, cannot continue")
            return

        # 3. Start the Ollama service
        ollama_process = start_ollama_service()
        if not ollama_process:
            print("\n❌ Ollama service failed to start, cannot continue")
            return

        # 4. Pull the model
        if not pull_mistral_model():
            print("\n❌ Mistral model download failed, cannot continue")
            return

        # 5. Install Python dependencies
        install_python_dependencies()

        # 6. Configure the environment
        setup_environment()

        # 7. Run GraphRAG
        success = run_graphrag()

        if success:
            print("\n" + "="*70)
            print("✅ All tasks complete!")
            print("="*70)

            print("\n📊 Generated files:")
            if os.path.exists("data/knowledge_graph.json"):
                print("   ✅ data/knowledge_graph.json")

            # Offer a download option
            if is_colab:
                print("\n💾 Download the results:")
                print("   from google.colab import files")
                print("   files.download('data/knowledge_graph.json')")

    except KeyboardInterrupt:
        print("\n\n⚠️ Interrupted by the user")

    except Exception as e:
        print(f"\n❌ Error during execution: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Cleanup
        print("\n⚠️ Note: the Ollama service is still running in the background")
        print("   To stop it: !pkill -f 'ollama serve'")
        print("   Or call: cleanup()")

if __name__ == "__main__":
    main()
config.py CHANGED
@@ -37,7 +37,13 @@ def setup_environment():
 
 
 # Model configuration
-LOCAL_LLM = "mistral"
+# On Kaggle, prefer a smaller model for faster downloads
+# Available models:
+# - "mistral" (4GB) - best quality, but slow to download
+# - "phi" (1.6GB) - balanced choice, fairly fast
+# - "tinyllama" (600MB) - fastest, slightly lower quality
+# - "qwen:0.5b" (350MB) - tiny model, extremely fast
+LOCAL_LLM = "mistral"  # On Kaggle, change to "phi" or "tinyllama"
 
 # Knowledge-base URL configuration
 KNOWLEDGE_BASE_URLS = [
setup_and_run.py ADDED
@@ -0,0 +1,112 @@
#!/usr/bin/env python3
"""
Environment setup and run script
Simplified version: only configures the environment and runs main_graphrag.py

Usage:
    python setup_and_run.py
"""

import os
import sys
import subprocess

print("="*60)
print("🚀 GraphRAG environment setup and run")
print("="*60)

# ============================================================
# 1. Configure the environment
# ============================================================
def setup_environment():
    """Configure environment variables"""
    print("\n⚙️ Step 1/2: Configuring environment variables...")

    # Check for a .env file
    if os.path.exists(".env"):
        print("   ✅ Found a .env file, loading it...")
        try:
            from dotenv import load_dotenv
            load_dotenv()
            print("   ✅ Environment variables loaded")
        except ImportError:
            print("   ⚠️ python-dotenv not installed, skipping .env loading")
    else:
        print("   ℹ️ No .env file found")

    # Show the environment variable status
    print("\n   📋 Environment variable status:")
    print(f"   • TAVILY_API_KEY: {'✅ set' if os.environ.get('TAVILY_API_KEY') else '⚠️ not set'}")
    print(f"   • NOMIC_API_KEY: {'✅ set' if os.environ.get('NOMIC_API_KEY') else '⚠️ not set'}")

    # Add the current directory to the Python path
    current_dir = os.getcwd()
    if current_dir not in sys.path:
        sys.path.insert(0, current_dir)
        print(f"\n   ✅ Added to the Python path: {current_dir}")

# ============================================================
# 2. Run main_graphrag.py
# ============================================================
def run_main_graphrag():
    """Run main_graphrag.py"""
    print("\n🚀 Step 2/2: Running main_graphrag.py...")
    print("="*60)

    # Make sure the file exists
    if not os.path.exists("main_graphrag.py"):
        print("\n❌ Error: main_graphrag.py not found")
        print("   Make sure you run this script from the correct directory")
        return False

    print("\n🔄 Starting GraphRAG...\n")

    try:
        # Run main_graphrag.py
        result = subprocess.run(
            [sys.executable, "main_graphrag.py"],
            capture_output=False,  # stream output live
        )

        if result.returncode == 0:
            print("\n" + "="*60)
            print("✅ Run succeeded!")
            print("="*60)
            return True
        else:
            print("\n" + "="*60)
            print(f"❌ Run failed (exit code: {result.returncode})")
            print("="*60)
            return False

    except KeyboardInterrupt:
        print("\n\n⚠️ Interrupted by the user")
        return False
    except Exception as e:
        print(f"\n❌ Runtime error: {e}")
        return False

# ============================================================
# Main
# ============================================================
def main():
    """Main execution flow"""
    try:
        # 1. Configure the environment
        setup_environment()

        # 2. Run main_graphrag.py
        success = run_main_graphrag()

        if success:
            print("\n💡 Tip: the generated knowledge graph is saved at the configured path")

    except KeyboardInterrupt:
        print("\n\n⚠️ Interrupted by the user")
    except Exception as e:
        print(f"\n❌ Error during execution: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()