Spaces:
Paused
Paused
lanny xu
committed on
Commit
·
215d348
1
Parent(s):
8821b53
add Milvus db
Browse files
- config.py +12 -0
- document_processor.py +67 -6
- requirements.txt +1 -0
config.py
CHANGED
|
@@ -60,9 +60,21 @@ CHUNK_SIZE = 250
|
|
| 60 |
CHUNK_OVERLAP = 50 # 添加重叠以保持上下文连贯性,提升检索准确率
|
| 61 |
|
| 62 |
# 向量数据库配置
|
|
|
|
| 63 |
COLLECTION_NAME = "rag-chroma"
|
| 64 |
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2" # HuggingFace嵌入模型
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
# 搜索配置
|
| 67 |
WEB_SEARCH_RESULTS_COUNT = 3
|
| 68 |
|
|
|
|
| 60 |
CHUNK_OVERLAP = 50 # 添加重叠以保持上下文连贯性,提升检索准确率
|
| 61 |
|
| 62 |
# 向量数据库配置
|
| 63 |
+
VECTOR_STORE_TYPE = os.environ.get("VECTOR_STORE_TYPE", "chroma") # 可选: "chroma", "milvus"
|
| 64 |
COLLECTION_NAME = "rag-chroma"
|
| 65 |
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2" # HuggingFace嵌入模型
|
| 66 |
|
| 67 |
+
# Milvus 配置 (仅当 VECTOR_STORE_TYPE="milvus" 时生效)
|
| 68 |
+
# 1. Milvus Lite (本地文件模式): 仅需设置 MILVUS_URI,无需 User/Password。适合 Kaggle/本地开发。
|
| 69 |
+
# 2. Zilliz Cloud (云服务): 需要设置 MILVUS_URI (https://...) 和 MILVUS_PASSWORD (API Key/Token)。需官网注册。
|
| 70 |
+
# 3. Milvus Server (Docker/K8s): 需要设置 HOST/PORT,可选 User/Password;同时必须将 MILVUS_URI 显式设为空字符串,否则会优先走 URI 模式。
|
| 71 |
+
MILVUS_HOST = os.environ.get("MILVUS_HOST", "localhost")
|
| 72 |
+
MILVUS_PORT = os.environ.get("MILVUS_PORT", "19530")
|
| 73 |
+
MILVUS_USER = os.environ.get("MILVUS_USER", "") # 仅在自建 Server 开启认证或使用 Zilliz Cloud 时需要
|
| 74 |
+
MILVUS_PASSWORD = os.environ.get("MILVUS_PASSWORD", "") # Zilliz Cloud 的 API Key 也填在这里
|
| 75 |
+
# MILVUS_URI 配置: 只要 MILVUS_URI 非空就优先使用 URI 连接 —— 本地路径 (如默认的 "./milvus_rag.db") 为 Milvus Lite 本地文件模式,http(s):// 开头则为云服务/远程模式;设为空字符串时才回退到 HOST/PORT 连接
|
| 76 |
+
MILVUS_URI = os.environ.get("MILVUS_URI", "./milvus_rag.db")
|
| 77 |
+
|
| 78 |
# 搜索配置
|
| 79 |
WEB_SEARCH_RESULTS_COUNT = 3
|
| 80 |
|
document_processor.py
CHANGED
|
@@ -25,6 +25,13 @@ from config import (
|
|
| 25 |
KEYWORD_SEARCH_K,
|
| 26 |
BM25_K1,
|
| 27 |
BM25_B,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
# 查询扩展配置
|
| 29 |
ENABLE_QUERY_EXPANSION,
|
| 30 |
QUERY_EXPANSION_MODEL,
|
|
@@ -236,12 +243,66 @@ class DocumentProcessor:
|
|
| 236 |
os.makedirs(persist_directory, exist_ok=True)
|
| 237 |
print(f"💾 使用默认持久化目录: {persist_directory}")
|
| 238 |
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
self.retriever = self.vectorstore.as_retriever()
|
| 246 |
|
| 247 |
# 如果启用混合检索,创建BM25检索器和集成检索器
|
|
|
|
| 25 |
KEYWORD_SEARCH_K,
|
| 26 |
BM25_K1,
|
| 27 |
BM25_B,
|
| 28 |
+
# 向量库配置
|
| 29 |
+
VECTOR_STORE_TYPE,
|
| 30 |
+
MILVUS_HOST,
|
| 31 |
+
MILVUS_PORT,
|
| 32 |
+
MILVUS_USER,
|
| 33 |
+
MILVUS_PASSWORD,
|
| 34 |
+
MILVUS_URI,
|
| 35 |
# 查询扩展配置
|
| 36 |
ENABLE_QUERY_EXPANSION,
|
| 37 |
QUERY_EXPANSION_MODEL,
|
|
|
|
| 243 |
os.makedirs(persist_directory, exist_ok=True)
|
| 244 |
print(f"💾 使用默认持久化目录: {persist_directory}")
|
| 245 |
|
| 246 |
+
if VECTOR_STORE_TYPE.lower() == "milvus":
|
| 247 |
+
try:
|
| 248 |
+
from langchain_community.vectorstores import Milvus
|
| 249 |
+
|
| 250 |
+
# 准备连接参数
|
| 251 |
+
connection_args = {}
|
| 252 |
+
|
| 253 |
+
# 优先使用 URI (支持 Milvus Lite 本地文件 或 Zilliz Cloud)
|
| 254 |
+
# 只要 MILVUS_URI 被设置(config中默认是 ./milvus_rag.db),且不是空字符串
|
| 255 |
+
if MILVUS_URI and len(MILVUS_URI.strip()) > 0:
|
| 256 |
+
# 判断是本地文件还是云服务
|
| 257 |
+
is_local_file = not (MILVUS_URI.startswith("http://") or MILVUS_URI.startswith("https://"))
|
| 258 |
+
mode_name = "Lite (Local File)" if is_local_file else "Cloud (HTTP)"
|
| 259 |
+
|
| 260 |
+
print(f"🔄 正在连接 Milvus {mode_name} ({MILVUS_URI})...")
|
| 261 |
+
connection_args["uri"] = MILVUS_URI
|
| 262 |
+
|
| 263 |
+
# 如果是云服务,通常需要 token (使用 password 字段作为 token)
|
| 264 |
+
if not is_local_file and MILVUS_PASSWORD:
|
| 265 |
+
connection_args["token"] = MILVUS_PASSWORD
|
| 266 |
+
else:
|
| 267 |
+
# 传统的 Host/Port 连接
|
| 268 |
+
print(f"🔄 正在连接 Milvus Server ({MILVUS_HOST}:{MILVUS_PORT})...")
|
| 269 |
+
connection_args = {
|
| 270 |
+
"host": MILVUS_HOST,
|
| 271 |
+
"port": MILVUS_PORT,
|
| 272 |
+
"user": MILVUS_USER,
|
| 273 |
+
"password": MILVUS_PASSWORD
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
self.vectorstore = Milvus.from_documents(
|
| 277 |
+
documents=doc_splits,
|
| 278 |
+
embedding=self.embeddings,
|
| 279 |
+
collection_name=COLLECTION_NAME,
|
| 280 |
+
connection_args=connection_args,
|
| 281 |
+
drop_old=True # 重新创建索引
|
| 282 |
+
)
|
| 283 |
+
print("✅ Milvus 向量数据库初始化成功")
|
| 284 |
+
except ImportError:
|
| 285 |
+
print("❌ 未安装 pymilvus,请运行: pip install pymilvus")
|
| 286 |
+
raise
|
| 287 |
+
except Exception as e:
|
| 288 |
+
print(f"❌ Milvus 连接失败: {e}")
|
| 289 |
+
print("⚠️ 回退到 Chroma 数据库...")
|
| 290 |
+
# Fallback to Chroma
|
| 291 |
+
self.vectorstore = Chroma.from_documents(
|
| 292 |
+
documents=doc_splits,
|
| 293 |
+
collection_name=COLLECTION_NAME,
|
| 294 |
+
embedding=self.embeddings,
|
| 295 |
+
persist_directory=persist_directory
|
| 296 |
+
)
|
| 297 |
+
else:
|
| 298 |
+
# Default: Chroma
|
| 299 |
+
self.vectorstore = Chroma.from_documents(
|
| 300 |
+
documents=doc_splits,
|
| 301 |
+
collection_name=COLLECTION_NAME,
|
| 302 |
+
embedding=self.embeddings,
|
| 303 |
+
persist_directory=persist_directory # 添加持久化目录
|
| 304 |
+
)
|
| 305 |
+
|
| 306 |
self.retriever = self.vectorstore.as_retriever()
|
| 307 |
|
| 308 |
# 如果启用混合检索,创建BM25检索器和集成检索器
|
requirements.txt
CHANGED
|
@@ -12,6 +12,7 @@ langchain-ollama>=0.1.0
|
|
| 12 |
|
| 13 |
# 向量数据库和嵌入
|
| 14 |
chromadb>=0.4.0
|
|
|
|
| 15 |
sentence-transformers>=2.2.0
|
| 16 |
torch>=2.0.0
|
| 17 |
transformers>=4.30.0
|
|
|
|
| 12 |
|
| 13 |
# 向量数据库和嵌入
|
| 14 |
chromadb>=0.4.0
|
| 15 |
+
pymilvus>=2.4.2 # Milvus 客户端及 Lite 本地模式支持
|
| 16 |
sentence-transformers>=2.2.0
|
| 17 |
torch>=2.0.0
|
| 18 |
transformers>=4.30.0
|