Spaces:
Sleeping
Sleeping
Update rag_engine.py
Browse files- rag_engine.py +15 -22
rag_engine.py
CHANGED
|
@@ -7,6 +7,7 @@ from clare_core import (
|
|
| 7 |
cosine_similarity,
|
| 8 |
)
|
| 9 |
from langsmith import traceable
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
|
|
@@ -57,18 +58,13 @@ def retrieve_relevant_chunks(
|
|
| 57 |
返回拼接后的文本,供 prompt 使用。
|
| 58 |
(增强版本:将检索内容记录到 LangSmith metadata)
|
| 59 |
"""
|
| 60 |
-
from langsmith import get_current_run
|
| 61 |
-
|
| 62 |
-
# 1) 空安全检查
|
| 63 |
if not rag_chunks:
|
| 64 |
return ""
|
| 65 |
|
| 66 |
-
# 2) 问题 embedding
|
| 67 |
q_emb = get_embedding(question)
|
| 68 |
if q_emb is None:
|
| 69 |
return ""
|
| 70 |
|
| 71 |
-
# 3) 计算相似度
|
| 72 |
scored = []
|
| 73 |
for item in rag_chunks:
|
| 74 |
emb = item.get("embedding")
|
|
@@ -81,26 +77,23 @@ def retrieve_relevant_chunks(
|
|
| 81 |
if not scored:
|
| 82 |
return ""
|
| 83 |
|
| 84 |
-
# 4) 按相似度排序
|
| 85 |
scored.sort(key=lambda x: x[0], reverse=True)
|
| 86 |
top_items = scored[:top_k]
|
| 87 |
top_chunks = [t for _sim, t in top_items]
|
| 88 |
|
| 89 |
-
#
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
"text_preview": text[:300], # 避免 UI 太长,取前300字
|
| 99 |
-
}
|
| 100 |
-
for sim, text in top_items
|
| 101 |
-
]
|
| 102 |
-
}
|
| 103 |
)
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
-
#
|
| 106 |
-
return "\n---\n".join(top_chunks)
|
|
|
|
| 7 |
cosine_similarity,
|
| 8 |
)
|
| 9 |
from langsmith import traceable
|
| 10 |
+
from langsmith.run_helpers import set_run_metadata
|
| 11 |
|
| 12 |
|
| 13 |
def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
|
|
|
|
| 58 |
返回拼接后的文本,供 prompt 使用。
|
| 59 |
(增强版本:将检索内容记录到 LangSmith metadata)
|
| 60 |
"""
|
|
|
|
|
|
|
|
|
|
| 61 |
if not rag_chunks:
|
| 62 |
return ""
|
| 63 |
|
|
|
|
| 64 |
q_emb = get_embedding(question)
|
| 65 |
if q_emb is None:
|
| 66 |
return ""
|
| 67 |
|
|
|
|
| 68 |
scored = []
|
| 69 |
for item in rag_chunks:
|
| 70 |
emb = item.get("embedding")
|
|
|
|
| 77 |
if not scored:
|
| 78 |
return ""
|
| 79 |
|
|
|
|
| 80 |
scored.sort(key=lambda x: x[0], reverse=True)
|
| 81 |
top_items = scored[:top_k]
|
| 82 |
top_chunks = [t for _sim, t in top_items]
|
| 83 |
|
| 84 |
+
# 使用 set_run_metadata 给当前 retriever run 打 metadata
|
| 85 |
+
try:
|
| 86 |
+
previews = [
|
| 87 |
+
{"score": float(sim), "text_preview": text[:300]}
|
| 88 |
+
for sim, text in top_items
|
| 89 |
+
]
|
| 90 |
+
set_run_metadata(
|
| 91 |
+
question=question,
|
| 92 |
+
retrieved_chunks=previews,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
)
|
| 94 |
+
except Exception as e:
|
| 95 |
+
# observability 出错不能影响主流程
|
| 96 |
+
print(f"[LangSmith metadata error in retrieve_relevant_chunks] {repr(e)}")
|
| 97 |
|
| 98 |
+
# 用分隔线拼接,方便模型辨认不同片段
|
| 99 |
+
return "\n---\n".join(top_chunks)
|