test_AI_Agent

Sleeping

App Files Community

SarahXia0405 committed on Dec 19, 2025

Commit

834218c

verified ·

1 Parent(s): c5f1e41

Update api/syllabus_utils.py

Browse files

Files changed (1) hide show

api/syllabus_utils.py +9 -14

api/syllabus_utils.py CHANGED Viewed

@@ -1,3 +1,4 @@
 """
 工具函数：
 - 解析 Syllabus（.docx / .pdf / .pptx）
@@ -11,33 +12,30 @@ from docx import Document
 from pypdf import PdfReader
 from pptx import Presentation  # python-pptx
-# 关键修复：从 api 包内导入 config
-from .config import DEFAULT_COURSE_TOPICS
 def parse_syllabus_docx(path: str) -> List[str]:
     """
     从 .docx 文件中提取课程大纲。
-    简单版本：按段落抽取，过滤空行。
     """
     doc = Document(path)
     paragraphs = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
-    # 如果包含 "Week" 这样的前缀，用这些段落作为 topics
     week_like = [p for p in paragraphs if p.lower().startswith("week ")]
     if week_like:
         return week_like
-    # 否则就截取前若干行当作大纲
     return paragraphs[: len(DEFAULT_COURSE_TOPICS)] or DEFAULT_COURSE_TOPICS
 def parse_syllabus_pdf(path: str) -> List[str]:
     """
     简单版 PDF 解析：
-    - 把所有页面的文本抽出来
-    - 按空行切成 chunk
-    - 返回非空 chunk 列表
     """
     reader = PdfReader(path)
     pages_text = []
@@ -48,19 +46,15 @@ def parse_syllabus_pdf(path: str) -> List[str]:
     full_text = "\n".join(pages_text)
-    # 按空行分段
     raw_chunks = [chunk.strip() for chunk in full_text.split("\n\n")]
     chunks = [c for c in raw_chunks if c]
-    # 作为 syllabus 使用时，我们只取前若干段作为“课程大纲”
     return chunks[: len(DEFAULT_COURSE_TOPICS)] or DEFAULT_COURSE_TOPICS
 def parse_pptx_slides(path: str) -> List[str]:
     """
-    从 .pptx 文件中抽取每一页 slide 的文本，返回一个字符串列表。
-    - 每一页作为一个文本块（RAG 的一个 chunk）
-    - 只收集有文字的 shape
     """
     prs = Presentation(path)
     slide_texts: List[str] = []
@@ -82,7 +76,7 @@ def extract_course_topics_from_file(file_obj, doc_type: str) -> List[str]:
     """
     根据上传文件和 doc_type 提取课程大纲 topics。
     - 只有 doc_type == "Syllabus" 时才尝试从文件解析；否则用默认大纲。
-    - 支持 .docx + .pdf + .pptx
     """
     if file_obj is None:
         return DEFAULT_COURSE_TOPICS
@@ -102,6 +96,7 @@ def extract_course_topics_from_file(file_obj, doc_type: str) -> List[str]:
         elif ext == ".pdf":
             topics = parse_syllabus_pdf(file_path)
         elif ext == ".pptx":
             topics = parse_pptx_slides(file_path)
         else:
             print(f"[Syllabus] Unsupported file type for syllabus: {ext}")

+# api/syllabus_utils.py
 """
 工具函数：
 - 解析 Syllabus（.docx / .pdf / .pptx）
 from pypdf import PdfReader
 from pptx import Presentation  # python-pptx
+from api.config import DEFAULT_COURSE_TOPICS
 def parse_syllabus_docx(path: str) -> List[str]:
     """
     Extract course-outline topics from the .docx file at `path`.
+    Simple version: collect stripped, non-empty paragraphs; if any start with "week " (case-insensitive) return only those, otherwise return the first len(DEFAULT_COURSE_TOPICS) paragraphs, or DEFAULT_COURSE_TOPICS when there are none.
     """
     doc = Document(path)
     paragraphs = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
     week_like = [p for p in paragraphs if p.lower().startswith("week ")]
     if week_like:
         return week_like
     return paragraphs[: len(DEFAULT_COURSE_TOPICS)] or DEFAULT_COURSE_TOPICS
 def parse_syllabus_pdf(path: str) -> List[str]:
     """
     简单版 PDF 解析：
+    - 抽取所有页文本
+    - 按空行切段
+    - 返回前若干段作为“课程大纲 topics”
     """
     reader = PdfReader(path)
     pages_text = []
     full_text = "\n".join(pages_text)
     raw_chunks = [chunk.strip() for chunk in full_text.split("\n\n")]
     chunks = [c for c in raw_chunks if c]
     return chunks[: len(DEFAULT_COURSE_TOPICS)] or DEFAULT_COURSE_TOPICS
 def parse_pptx_slides(path: str) -> List[str]:
     """
+    从 .pptx 文件中抽取每一页 slide 的文本（每页一个块）。
     """
     prs = Presentation(path)
     slide_texts: List[str] = []
     """
     根据上传文件和 doc_type 提取课程大纲 topics。
     - 只有 doc_type == "Syllabus" 时才尝试从文件解析；否则用默认大纲。
+    - 支持 .docx / .pdf / .pptx
     """
     if file_obj is None:
         return DEFAULT_COURSE_TOPICS
         elif ext == ".pdf":
             topics = parse_syllabus_pdf(file_path)
         elif ext == ".pptx":
+            # pptx 直接用 slides 文本作为 topics（也可以后续再做 “Week/Module” 提取）
             topics = parse_pptx_slides(file_path)
         else:
             print(f"[Syllabus] Unsupported file type for syllabus: {ext}")