Spaces:
Sleeping
Sleeping
Update rag_engine.py
Browse files
- rag_engine.py +26 -2
rag_engine.py
CHANGED
|
@@ -55,14 +55,20 @@ def retrieve_relevant_chunks(
|
|
| 55 |
"""
|
| 56 |
用 embedding 对当前问题做一次检索,从 rag_chunks 中找出最相关的 top_k 段落,
|
| 57 |
返回拼接后的文本,供 prompt 使用。
|
|
|
|
| 58 |
"""
|
|
|
|
|
|
|
|
|
|
| 59 |
if not rag_chunks:
|
| 60 |
return ""
|
| 61 |
|
|
|
|
| 62 |
q_emb = get_embedding(question)
|
| 63 |
if q_emb is None:
|
| 64 |
return ""
|
| 65 |
|
|
|
|
| 66 |
scored = []
|
| 67 |
for item in rag_chunks:
|
| 68 |
emb = item.get("embedding")
|
|
@@ -75,8 +81,26 @@ def retrieve_relevant_chunks(
|
|
| 75 |
if not scored:
|
| 76 |
return ""
|
| 77 |
|
|
|
|
| 78 |
scored.sort(key=lambda x: x[0], reverse=True)
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
-
#
|
| 82 |
return "\n---\n".join(top_chunks)
|
|
|
|
| 55 |
"""
|
| 56 |
用 embedding 对当前问题做一次检索,从 rag_chunks 中找出最相关的 top_k 段落,
|
| 57 |
返回拼接后的文本,供 prompt 使用。
|
| 58 |
+
(增强版本:将检索内容记录到 LangSmith metadata)
|
| 59 |
"""
|
| 60 |
+
from langsmith import get_current_run
|
| 61 |
+
|
| 62 |
+
# 1) 空安全检查
|
| 63 |
if not rag_chunks:
|
| 64 |
return ""
|
| 65 |
|
| 66 |
+
# 2) 问题 embedding
|
| 67 |
q_emb = get_embedding(question)
|
| 68 |
if q_emb is None:
|
| 69 |
return ""
|
| 70 |
|
| 71 |
+
# 3) 计算相似度
|
| 72 |
scored = []
|
| 73 |
for item in rag_chunks:
|
| 74 |
emb = item.get("embedding")
|
|
|
|
| 81 |
if not scored:
|
| 82 |
return ""
|
| 83 |
|
| 84 |
+
# 4) 按相似度排序
|
| 85 |
scored.sort(key=lambda x: x[0], reverse=True)
|
| 86 |
+
top_items = scored[:top_k]
|
| 87 |
+
top_chunks = [t for _sim, t in top_items]
|
| 88 |
+
|
| 89 |
+
# 5) ⭐ 记录到 LangSmith(每个 chunk 的文本 + 相似度)
|
| 90 |
+
run = get_current_run()
|
| 91 |
+
if run:
|
| 92 |
+
run.update(
|
| 93 |
+
metadata={
|
| 94 |
+
"question": question,
|
| 95 |
+
"retrieved_chunks": [
|
| 96 |
+
{
|
| 97 |
+
"score": float(sim),
|
| 98 |
+
"text_preview": text[:300], # 避免 UI 太长,取前300字
|
| 99 |
+
}
|
| 100 |
+
for sim, text in top_items
|
| 101 |
+
]
|
| 102 |
+
}
|
| 103 |
+
)
|
| 104 |
|
| 105 |
+
# 6) 返回原格式的拼接结果
|
| 106 |
return "\n---\n".join(top_chunks)
|