test_AI_Agent

Sleeping

SarahXia0405 committed on Dec 5, 2025

Commit

d2891a7

verified ·

1 Parent(s): 22a9a4a

Update syllabus_utils.py

Files changed (1) hide show

syllabus_utils.py CHANGED Viewed

@@ -1,7 +1,7 @@
 # syllabus_utils.py
 """
 工具函数：
-- 解析 Syllabus（.docx / .pdf）
 - 提取课程大纲 topics
 """
@@ -56,6 +56,30 @@ def parse_syllabus_pdf(path: str) -> List[str]:
     return chunks[: len(DEFAULT_COURSE_TOPICS)] or DEFAULT_COURSE_TOPICS
 def extract_course_topics_from_file(file_obj, doc_type: str) -> List[str]:
     """
     根据上传文件和 doc_type 提取课程大纲 topics。

 # syllabus_utils.py
 """
 工具函数：
+- 解析 Syllabus（.docx / .pdf / .pptx）
 - 提取课程大纲 topics
 """
     return chunks[: len(DEFAULT_COURSE_TOPICS)] or DEFAULT_COURSE_TOPICS
+from pptx import Presentation  #  python-pptx
+def parse_pptx_slides(path: str) -> List[str]:
+    """
+    Extract the text of each slide in a .pptx file as a list of strings.
+
+    - Each slide becomes one text block (intended as one RAG chunk).
+    - Only shapes that expose non-empty text contribute to a slide's block;
+      their stripped texts are joined with newlines in shape order.
+    - Slides that yield no text at all are omitted from the result.
+    """
+    deck = Presentation(path)
+    chunks: List[str] = []
+    for page in deck.slides:
+        fragments: List[str] = []
+        for item in page.shapes:
+            # Shapes without a ``text`` attribute, or with empty text, add nothing.
+            raw = getattr(item, "text", None)
+            if not raw:
+                continue
+            cleaned = raw.strip()
+            # Whitespace-only text is dropped as well.
+            if cleaned:
+                fragments.append(cleaned)
+        if not fragments:
+            continue
+        chunks.append("\n".join(fragments))
+    return chunks
 def extract_course_topics_from_file(file_obj, doc_type: str) -> List[str]:
     """
     根据上传文件和 doc_type 提取课程大纲 topics。