SarahXia0405 committed on
Commit
ddeab24
·
verified ·
1 Parent(s): e285f2a

Update rag_engine.py

Browse files
Files changed (1) hide show
  1. rag_engine.py +28 -15
rag_engine.py CHANGED
@@ -1,8 +1,9 @@
1
  # rag_engine.py
2
- from typing import List, Dict, Optional
 
3
 
 
4
  from clare_core import (
5
- parse_syllabus_docx,
6
  get_embedding,
7
  cosine_similarity,
8
  )
@@ -12,26 +13,38 @@ from langsmith.run_helpers import set_run_metadata
12
 
13
  def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
14
  """
15
- 从上传的文件构建 RAG chunk 列表(session 级别)
16
- - 目前只支持 .docx
17
- - 使用 parse_syllabus_docx 把文档按段落切片
18
- - 对每个非空段落做 embedding {"text": str, "embedding": List[float]}
 
 
 
 
19
  """
20
  if file is None:
21
  return []
22
 
 
 
 
 
 
 
23
  try:
24
- file_path = file.name
25
- if not file_path.lower().endswith(".docx"):
26
- # 目前先只支持 docx,后面可以扩展 pdf / txt
 
 
 
 
27
  return []
28
 
29
- # 多取一些行,比课程大纲用的 15 更长
30
- paragraphs = parse_syllabus_docx(file_path, max_lines=100)
31
-
32
  chunks: List[Dict] = []
33
- for para in paragraphs:
34
- text = para.strip()
35
  if not text:
36
  continue
37
  emb = get_embedding(text)
@@ -39,7 +52,7 @@ def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
39
  continue
40
  chunks.append({"text": text, "embedding": emb})
41
 
42
- print(f"[RAG] built {len(chunks)} chunks from uploaded file")
43
  return chunks
44
 
45
  except Exception as e:
 
1
  # rag_engine.py
2
+ import os
3
+ from typing import List, Dict
4
 
5
+ from syllabus_utils import parse_syllabus_docx, parse_syllabus_pdf
6
  from clare_core import (
 
7
  get_embedding,
8
  cosine_similarity,
9
  )
 
13
 
14
  def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
15
  """
16
+ 从上传的文件构建 RAG chunk 列表(session 级别)
17
+
18
+ - 支持 .docx 和 .pdf
19
+ - 复用 syllabus_utils 里的解析函数把文档切一系列文本块
20
+ - 对每个非空文本块做 embedding,存成 {"text": str, "embedding": List[float]}
21
+
22
+ 当前 doc_type_val 主要用于未来扩展(不同类型文件可采用不同切块策略),
23
+ 这里先不区分,统一按段落/块处理。
24
  """
25
  if file is None:
26
  return []
27
 
28
+ file_path = getattr(file, "name", None)
29
+ if not file_path:
30
+ return []
31
+
32
+ ext = os.path.splitext(file_path)[1].lower()
33
+
34
  try:
35
+ # 1) 解析文件 → 得到一组文本块
36
+ if ext == ".docx":
37
+ texts = parse_syllabus_docx(file_path)
38
+ elif ext == ".pdf":
39
+ texts = parse_syllabus_pdf(file_path)
40
+ else:
41
+ print(f"[RAG] unsupported file type for RAG: {ext}")
42
  return []
43
 
44
+ # 2) 对每个文本块做 embedding
 
 
45
  chunks: List[Dict] = []
46
+ for t in texts:
47
+ text = t.strip()
48
  if not text:
49
  continue
50
  emb = get_embedding(text)
 
52
  continue
53
  chunks.append({"text": text, "embedding": emb})
54
 
55
+ print(f"[RAG] built {len(chunks)} chunks from uploaded file ({ext}, doc_type={doc_type_val})")
56
  return chunks
57
 
58
  except Exception as e: