Spaces:
Sleeping
Sleeping
Update api/syllabus_utils.py
Files changed: api/syllabus_utils.py (+9 -14)
api/syllabus_utils.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
"""
|
| 2 |
工具函数:
|
| 3 |
- 解析 Syllabus(.docx / .pdf / .pptx)
|
|
@@ -11,33 +12,30 @@ from docx import Document
|
|
| 11 |
from pypdf import PdfReader
|
| 12 |
from pptx import Presentation # python-pptx
|
| 13 |
|
| 14 |
-
|
| 15 |
-
from .config import DEFAULT_COURSE_TOPICS
|
| 16 |
|
| 17 |
|
| 18 |
def parse_syllabus_docx(path: str) -> List[str]:
|
| 19 |
"""
|
| 20 |
从 .docx 文件中提取课程大纲。
|
| 21 |
-
|
| 22 |
"""
|
| 23 |
doc = Document(path)
|
| 24 |
paragraphs = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
|
| 25 |
|
| 26 |
-
# 如果包含 "Week" 这样的前缀,用这些段落作为 topics
|
| 27 |
week_like = [p for p in paragraphs if p.lower().startswith("week ")]
|
| 28 |
if week_like:
|
| 29 |
return week_like
|
| 30 |
|
| 31 |
-
# 否则就截取前若干行当作大纲
|
| 32 |
return paragraphs[: len(DEFAULT_COURSE_TOPICS)] or DEFAULT_COURSE_TOPICS
|
| 33 |
|
| 34 |
|
| 35 |
def parse_syllabus_pdf(path: str) -> List[str]:
|
| 36 |
"""
|
| 37 |
简单版 PDF 解析:
|
| 38 |
-
-
|
| 39 |
-
-
|
| 40 |
-
-
|
| 41 |
"""
|
| 42 |
reader = PdfReader(path)
|
| 43 |
pages_text = []
|
|
@@ -48,19 +46,15 @@ def parse_syllabus_pdf(path: str) -> List[str]:
|
|
| 48 |
|
| 49 |
full_text = "\n".join(pages_text)
|
| 50 |
|
| 51 |
-
# 按空行分段
|
| 52 |
raw_chunks = [chunk.strip() for chunk in full_text.split("\n\n")]
|
| 53 |
chunks = [c for c in raw_chunks if c]
|
| 54 |
|
| 55 |
-
# 作为 syllabus 使用时,我们只取前若干段作为“课程大纲”
|
| 56 |
return chunks[: len(DEFAULT_COURSE_TOPICS)] or DEFAULT_COURSE_TOPICS
|
| 57 |
|
| 58 |
|
| 59 |
def parse_pptx_slides(path: str) -> List[str]:
|
| 60 |
"""
|
| 61 |
-
从 .pptx 文件中抽取每一页 slide
|
| 62 |
-
- 每一页作为一个文本块(RAG 的一个 chunk)
|
| 63 |
-
- 只收集有文字的 shape
|
| 64 |
"""
|
| 65 |
prs = Presentation(path)
|
| 66 |
slide_texts: List[str] = []
|
|
@@ -82,7 +76,7 @@ def extract_course_topics_from_file(file_obj, doc_type: str) -> List[str]:
|
|
| 82 |
"""
|
| 83 |
根据上传文件和 doc_type 提取课程大纲 topics。
|
| 84 |
- 只有 doc_type == "Syllabus" 时才尝试从文件解析;否则用默认大纲。
|
| 85 |
-
- 支持 .docx
|
| 86 |
"""
|
| 87 |
if file_obj is None:
|
| 88 |
return DEFAULT_COURSE_TOPICS
|
|
@@ -102,6 +96,7 @@ def extract_course_topics_from_file(file_obj, doc_type: str) -> List[str]:
|
|
| 102 |
elif ext == ".pdf":
|
| 103 |
topics = parse_syllabus_pdf(file_path)
|
| 104 |
elif ext == ".pptx":
|
|
|
|
| 105 |
topics = parse_pptx_slides(file_path)
|
| 106 |
else:
|
| 107 |
print(f"[Syllabus] Unsupported file type for syllabus: {ext}")
|
|
|
|
| 1 |
+
# api/syllabus_utils.py
|
| 2 |
"""
|
| 3 |
工具函数:
|
| 4 |
- 解析 Syllabus(.docx / .pdf / .pptx)
|
|
|
|
| 12 |
from pypdf import PdfReader
|
| 13 |
from pptx import Presentation # python-pptx
|
| 14 |
|
| 15 |
+
from api.config import DEFAULT_COURSE_TOPICS
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
def parse_syllabus_docx(path: str) -> List[str]:
    """Extract course-outline topics from a .docx file.

    Paragraphs are read in document order, stripped of surrounding
    whitespace, and empty ones are dropped.

    If any remaining paragraph starts with "week " (case-insensitive),
    only those paragraphs are returned. Otherwise the first
    ``len(DEFAULT_COURSE_TOPICS)`` paragraphs are returned, falling back
    to ``DEFAULT_COURSE_TOPICS`` itself when that slice is empty.
    """
    lines: List[str] = []
    for para in Document(path).paragraphs:
        text = para.text
        if not text:
            continue
        text = text.strip()
        if text:
            lines.append(text)

    # Paragraphs that look like "Week N ..." headings take priority.
    weekly = [line for line in lines if line.lower().startswith("week ")]
    if weekly:
        return weekly

    # No week headings: use the leading paragraphs as the outline.
    head = lines[: len(DEFAULT_COURSE_TOPICS)]
    return head if head else DEFAULT_COURSE_TOPICS
|
| 31 |
|
| 32 |
|
| 33 |
def parse_syllabus_pdf(path: str) -> List[str]:
|
| 34 |
"""
|
| 35 |
简单版 PDF 解析:
|
| 36 |
+
- 抽取所有页文本
|
| 37 |
+
- 按空行切段
|
| 38 |
+
- 返回前若干段作为“课程大纲 topics”
|
| 39 |
"""
|
| 40 |
reader = PdfReader(path)
|
| 41 |
pages_text = []
|
|
|
|
| 46 |
|
| 47 |
full_text = "\n".join(pages_text)
|
| 48 |
|
|
|
|
| 49 |
raw_chunks = [chunk.strip() for chunk in full_text.split("\n\n")]
|
| 50 |
chunks = [c for c in raw_chunks if c]
|
| 51 |
|
|
|
|
| 52 |
return chunks[: len(DEFAULT_COURSE_TOPICS)] or DEFAULT_COURSE_TOPICS
|
| 53 |
|
| 54 |
|
| 55 |
def parse_pptx_slides(path: str) -> List[str]:
|
| 56 |
"""
|
| 57 |
+
从 .pptx 文件中抽取每一页 slide 的文本(每页一个块)。
|
|
|
|
|
|
|
| 58 |
"""
|
| 59 |
prs = Presentation(path)
|
| 60 |
slide_texts: List[str] = []
|
|
|
|
| 76 |
"""
|
| 77 |
根据上传文件和 doc_type 提取课程大纲 topics。
|
| 78 |
- 只有 doc_type == "Syllabus" 时才尝试从文件解析;否则用默认大纲。
|
| 79 |
+
- 支持 .docx / .pdf / .pptx
|
| 80 |
"""
|
| 81 |
if file_obj is None:
|
| 82 |
return DEFAULT_COURSE_TOPICS
|
|
|
|
| 96 |
elif ext == ".pdf":
|
| 97 |
topics = parse_syllabus_pdf(file_path)
|
| 98 |
elif ext == ".pptx":
|
| 99 |
+
# pptx 直接用 slides 文本作为 topics(也可以后续再做 “Week/Module” 提取)
|
| 100 |
topics = parse_pptx_slides(file_path)
|
| 101 |
else:
|
| 102 |
print(f"[Syllabus] Unsupported file type for syllabus: {ext}")
|