test_AI_Agent

Sleeping

App Files Files Community

SarahXia0405 committed on Dec 5, 2025

Commit

ec485a2

verified ·

1 Parent(s): 9470e5c

Create syllabus_utils.py

Browse files

Files changed (1) hide show

syllabus_utils.py +92 -0

syllabus_utils.py ADDED Viewed

	@@ -0,0 +1,92 @@

+# syllabus_utils.py
+"""
+工具函数：
+- 解析 Syllabus（.docx / .pdf）
+- 提取课程大纲 topics
+"""
+import os
+from typing import List
+from docx import Document
+from pypdf import PdfReader
+from config import DEFAULT_COURSE_TOPICS
+def parse_syllabus_docx(path: str) -> List[str]:
+    """
+    从 .docx 文件中提取课程大纲。
+    这里做一个简单版本：按段落抽取，过滤空行。
+    后面如果你想按 Week 0/1/2 更精细地切，可以再优化这里。
+    """
+    doc = Document(path)
+    paragraphs = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
+    # 如果包含 "Week" 这样的前缀，用这些段落作为 topics
+    week_like = [p for p in paragraphs if p.lower().startswith("week ")]
+    if week_like:
+        return week_like
+    # 否则就截取前若干行当作大纲
+    return paragraphs[: len(DEFAULT_COURSE_TOPICS)] or DEFAULT_COURSE_TOPICS
+def parse_syllabus_pdf(path: str) -> List[str]:
+    """
+    简单版 PDF 解析：
+    - 把所有页面的文本抽出来
+    - 按空行切成 chunk
+    - 返回非空 chunk 列表
+    """
+    reader = PdfReader(path)
+    pages_text = []
+    for page in reader.pages:
+        text = page.extract_text() or ""
+        if text.strip():
+            pages_text.append(text)
+    full_text = "\n".join(pages_text)
+    # 按空行分段
+    raw_chunks = [chunk.strip() for chunk in full_text.split("\n\n")]
+    chunks = [c for c in raw_chunks if c]
+    # 作为 syllabus 使用时，我们只取前若干段作为“课程大纲”
+    return chunks[: len(DEFAULT_COURSE_TOPICS)] or DEFAULT_COURSE_TOPICS
+def extract_course_topics_from_file(file_obj, doc_type: str) -> List[str]:
+    """
+    根据上传文件和 doc_type 提取课程大纲 topics。
+    - 只有 doc_type == "Syllabus" 时才尝试从文件解析；否则用默认大纲。
+    - 支持 .docx + .pdf
+    """
+    if file_obj is None:
+        return DEFAULT_COURSE_TOPICS
+    if doc_type != "Syllabus":
+        # 不是 Syllabus，就不要动课程大纲（可以用默认）
+        return DEFAULT_COURSE_TOPICS
+    # Gradio File 对象通常有 .name 属性（临时文件路径）
+    file_path = getattr(file_obj, "name", None)
+    if not file_path:
+        return DEFAULT_COURSE_TOPICS
+    ext = os.path.splitext(file_path)[1].lower()
+    try:
+        if ext == ".docx":
+            topics = parse_syllabus_docx(file_path)
+        elif ext == ".pdf":
+            topics = parse_syllabus_pdf(file_path)
+        else:
+            print(f"[Syllabus] Unsupported file type for syllabus: {ext}")
+            topics = DEFAULT_COURSE_TOPICS
+    except Exception as e:
+        print(f"[Syllabus] parse error: {repr(e)}")
+        topics = DEFAULT_COURSE_TOPICS
+    # 最后兜底，避免返回空列表
+    return topics or DEFAULT_COURSE_TOPICS