Spaces:
Sleeping
Update rag_engine.py
Browse files - rag_engine.py (+5 -6)
rag_engine.py
CHANGED
|
@@ -9,18 +9,15 @@ from clare_core import (
|
|
| 9 |
)
|
| 10 |
from langsmith import traceable
|
| 11 |
from langsmith.run_helpers import set_run_metadata
|
| 12 |
-
|
| 13 |
|
| 14 |
def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
|
| 15 |
"""
|
| 16 |
从上传的文件构建 RAG chunk 列表(session 级别)。
|
| 17 |
|
| 18 |
-
- 支持 .docx
|
| 19 |
- 复用 syllabus_utils 里的解析函数,把文档切成一系列文本块
|
| 20 |
- 对每个非空文本块做 embedding,存成 {"text": str, "embedding": List[float]}
|
| 21 |
-
|
| 22 |
-
当前 doc_type_val 主要用于未来扩展(不同类型文件可采用不同切块策略),
|
| 23 |
-
这里先不区分,统一按段落/块处理。
|
| 24 |
"""
|
| 25 |
if file is None:
|
| 26 |
return []
|
|
@@ -32,11 +29,13 @@ def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
|
|
| 32 |
ext = os.path.splitext(file_path)[1].lower()
|
| 33 |
|
| 34 |
try:
|
| 35 |
-
# 1) 解析文件 →
|
| 36 |
if ext == ".docx":
|
| 37 |
texts = parse_syllabus_docx(file_path)
|
| 38 |
elif ext == ".pdf":
|
| 39 |
texts = parse_syllabus_pdf(file_path)
|
|
|
|
|
|
|
| 40 |
else:
|
| 41 |
print(f"[RAG] unsupported file type for RAG: {ext}")
|
| 42 |
return []
|
|
|
|
| 9 |
)
|
| 10 |
from langsmith import traceable
|
| 11 |
from langsmith.run_helpers import set_run_metadata
|
| 12 |
+
from syllabus_utils import parse_syllabus_docx, parse_syllabus_pdf, parse_pptx_slides
|
| 13 |
|
| 14 |
def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
|
| 15 |
"""
|
| 16 |
从上传的文件构建 RAG chunk 列表(session 级别)。
|
| 17 |
|
| 18 |
+
- 支持 .docx / .pdf / .pptx
|
| 19 |
- 复用 syllabus_utils 里的解析函数,把文档切成一系列文本块
|
| 20 |
- 对每个非空文本块做 embedding,存成 {"text": str, "embedding": List[float]}
|
|
|
|
|
|
|
|
|
|
| 21 |
"""
|
| 22 |
if file is None:
|
| 23 |
return []
|
|
|
|
| 29 |
ext = os.path.splitext(file_path)[1].lower()
|
| 30 |
|
| 31 |
try:
|
| 32 |
+
# 1) 解析文件 → 文本块列表
|
| 33 |
if ext == ".docx":
|
| 34 |
texts = parse_syllabus_docx(file_path)
|
| 35 |
elif ext == ".pdf":
|
| 36 |
texts = parse_syllabus_pdf(file_path)
|
| 37 |
+
elif ext == ".pptx":
|
| 38 |
+
texts = parse_pptx_slides(file_path)
|
| 39 |
else:
|
| 40 |
print(f"[RAG] unsupported file type for RAG: {ext}")
|
| 41 |
return []
|