Spaces:
Sleeping
Sleeping
Update rag_engine.py
Browse files- rag_engine.py +28 -15
rag_engine.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
| 1 |
# rag_engine.py
|
| 2 |
-
|
|
|
|
| 3 |
|
|
|
|
| 4 |
from clare_core import (
|
| 5 |
-
parse_syllabus_docx,
|
| 6 |
get_embedding,
|
| 7 |
cosine_similarity,
|
| 8 |
)
|
|
@@ -12,26 +13,38 @@ from langsmith.run_helpers import set_run_metadata
|
|
| 12 |
|
| 13 |
def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
|
| 14 |
"""
|
| 15 |
-
从上传的文件构建 RAG chunk 列表(session 级别)
|
| 16 |
-
|
| 17 |
-
-
|
| 18 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
"""
|
| 20 |
if file is None:
|
| 21 |
return []
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
try:
|
| 24 |
-
|
| 25 |
-
if
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
return []
|
| 28 |
|
| 29 |
-
#
|
| 30 |
-
paragraphs = parse_syllabus_docx(file_path, max_lines=100)
|
| 31 |
-
|
| 32 |
chunks: List[Dict] = []
|
| 33 |
-
for
|
| 34 |
-
text =
|
| 35 |
if not text:
|
| 36 |
continue
|
| 37 |
emb = get_embedding(text)
|
|
@@ -39,7 +52,7 @@ def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
|
|
| 39 |
continue
|
| 40 |
chunks.append({"text": text, "embedding": emb})
|
| 41 |
|
| 42 |
-
print(f"[RAG] built {len(chunks)} chunks from uploaded file")
|
| 43 |
return chunks
|
| 44 |
|
| 45 |
except Exception as e:
|
|
|
|
| 1 |
# rag_engine.py
|
| 2 |
+
import os
|
| 3 |
+
from typing import List, Dict
|
| 4 |
|
| 5 |
+
from syllabus_utils import parse_syllabus_docx, parse_syllabus_pdf
|
| 6 |
from clare_core import (
|
|
|
|
| 7 |
get_embedding,
|
| 8 |
cosine_similarity,
|
| 9 |
)
|
|
|
|
| 13 |
|
| 14 |
def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
|
| 15 |
"""
|
| 16 |
+
从上传的文件构建 RAG chunk 列表(session 级别)。
|
| 17 |
+
|
| 18 |
+
- 支持 .docx 和 .pdf
|
| 19 |
+
- 复用 syllabus_utils 里的解析函数,把文档切成一系列文本块
|
| 20 |
+
- 对每个非空文本块做 embedding,存成 {"text": str, "embedding": List[float]}
|
| 21 |
+
|
| 22 |
+
当前 doc_type_val 主要用于未来扩展(不同类型文件可采用不同切块策略),
|
| 23 |
+
这里先不区分,统一按段落/块处理。
|
| 24 |
"""
|
| 25 |
if file is None:
|
| 26 |
return []
|
| 27 |
|
| 28 |
+
file_path = getattr(file, "name", None)
|
| 29 |
+
if not file_path:
|
| 30 |
+
return []
|
| 31 |
+
|
| 32 |
+
ext = os.path.splitext(file_path)[1].lower()
|
| 33 |
+
|
| 34 |
try:
|
| 35 |
+
# 1) 解析文件 → 得到一组文本块
|
| 36 |
+
if ext == ".docx":
|
| 37 |
+
texts = parse_syllabus_docx(file_path)
|
| 38 |
+
elif ext == ".pdf":
|
| 39 |
+
texts = parse_syllabus_pdf(file_path)
|
| 40 |
+
else:
|
| 41 |
+
print(f"[RAG] unsupported file type for RAG: {ext}")
|
| 42 |
return []
|
| 43 |
|
| 44 |
+
# 2) 对每个文本块做 embedding
|
|
|
|
|
|
|
| 45 |
chunks: List[Dict] = []
|
| 46 |
+
for t in texts:
|
| 47 |
+
text = t.strip()
|
| 48 |
if not text:
|
| 49 |
continue
|
| 50 |
emb = get_embedding(text)
|
|
|
|
| 52 |
continue
|
| 53 |
chunks.append({"text": text, "embedding": emb})
|
| 54 |
|
| 55 |
+
print(f"[RAG] built {len(chunks)} chunks from uploaded file ({ext}, doc_type={doc_type_val})")
|
| 56 |
return chunks
|
| 57 |
|
| 58 |
except Exception as e:
|