Spaces:
Sleeping
Sleeping
| # api/syllabus_utils.py | |
| """ | |
| 工具函数: | |
| - 解析 Syllabus(.docx / .pdf / .pptx) | |
| - 提取课程大纲 topics | |
| """ | |
| from __future__ import annotations | |
| import os | |
| from typing import List | |
| from docx import Document | |
| from pypdf import PdfReader | |
| from pptx import Presentation # python-pptx | |
| from api.config import DEFAULT_COURSE_TOPICS | |
def parse_syllabus_docx(path: str) -> List[str]:
    """
    Extract course-outline topics from a .docx file.

    Every non-blank paragraph is collected with surrounding whitespace
    removed. If any of them starts with "week " (case-insensitive), only
    those paragraphs are returned. Otherwise the leading paragraphs are
    returned, capped at ``len(DEFAULT_COURSE_TOPICS)`` entries; when the
    document yields nothing, ``DEFAULT_COURSE_TOPICS`` is returned.
    """
    cleaned: List[str] = []
    weekly: List[str] = []

    # Single pass: gather all usable paragraphs and, alongside, the
    # subset that looks like "Week N ..." headings.
    for para in Document(path).paragraphs:
        raw = para.text
        if not raw:
            continue
        stripped = raw.strip()
        if not stripped:
            continue
        cleaned.append(stripped)
        if stripped.lower().startswith("week "):
            weekly.append(stripped)

    # Week-style lines take priority over the generic paragraph list.
    if weekly:
        return weekly

    limit = len(DEFAULT_COURSE_TOPICS)
    head = cleaned[:limit]
    return head if head else DEFAULT_COURSE_TOPICS
def parse_syllabus_pdf(path: str) -> List[str]:
    """
    Extract course-outline topics from a PDF file (simple heuristic).

    Steps:
    - extract the text of every page, skipping pages with no text;
    - join the pages and split the result into chunks at blank lines;
    - return the first ``len(DEFAULT_COURSE_TOPICS)`` non-empty chunks.

    A line counts as blank when it is empty or holds only whitespace, so
    separators such as "\\n \\n" or "\\r\\n\\r\\n" also end a chunk.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The leading text chunks, or ``DEFAULT_COURSE_TOPICS`` when no
        text could be extracted.
    """
    import re  # function-scope import: only this parser needs it

    reader = PdfReader(path)

    pages_text: List[str] = []
    for page in reader.pages:
        # extract_text() result is guarded with `or ""` in case it is None.
        text = page.extract_text() or ""
        if text.strip():
            pages_text.append(text)

    full_text = "\n".join(pages_text)

    # Split on runs of blank lines. A plain split("\n\n") would miss blank
    # lines that contain spaces/tabs or a carriage return, merging what
    # should be separate chunks into one.
    raw_chunks = [chunk.strip() for chunk in re.split(r"\n\s*\n", full_text)]
    chunks = [c for c in raw_chunks if c]

    return chunks[: len(DEFAULT_COURSE_TOPICS)] or DEFAULT_COURSE_TOPICS
def parse_pptx_slides(path: str) -> List[str]:
    """
    Extract the text of a .pptx file, one block per slide.

    For each slide, the stripped text of every shape that exposes a
    non-empty ``text`` attribute is collected and joined with newlines.
    Slides that contribute no text are omitted from the result.
    """
    deck = Presentation(path)
    blocks: List[str] = []

    for slide in deck.slides:
        pieces: List[str] = []
        for shape in slide.shapes:
            # Shapes without a `text` attribute (or with empty text) are skipped.
            raw = getattr(shape, "text", None)
            if not raw:
                continue
            cleaned = raw.strip()
            if cleaned:
                pieces.append(cleaned)

        if not pieces:
            continue
        blocks.append("\n".join(pieces))

    return blocks
def extract_course_topics_from_file(file_obj, doc_type: str) -> List[str]:
    """
    Extract course-outline topics from an uploaded file.

    The file is parsed only when ``doc_type`` is "syllabus" (compared
    case-insensitively after stripping whitespace); any other value
    yields the default outline. Supported extensions: .docx, .pdf, .pptx.

    Args:
        file_obj: Either a filesystem path (``str`` / ``os.PathLike``) or
            an object whose ``name`` attribute is a readable path on
            disk. ``None`` yields the default outline.
        doc_type: Declared document type of the upload.

    Returns:
        The parsed topics, or ``DEFAULT_COURSE_TOPICS`` when the file is
        missing, unsupported, yields nothing, or fails to parse. Parse
        errors are reported on stdout and never raised.
    """
    if file_obj is None:
        return DEFAULT_COURSE_TOPICS

    doc_type_norm = (doc_type or "").strip().lower()
    if doc_type_norm != "syllabus":
        return DEFAULT_COURSE_TOPICS

    # A real, readable path is required here. Path-like inputs are used
    # directly: reading `.name` from a pathlib.Path would give only the
    # final path component, which would then not be found on disk.
    if isinstance(file_obj, (str, os.PathLike)):
        file_path = file_obj
    else:
        file_path = getattr(file_obj, "name", None)

    if isinstance(file_path, os.PathLike):
        file_path = os.fspath(file_path)

    # Some file objects carry a non-string `name` (e.g. an integer file
    # descriptor). os.path.exists() accepts an int but os.path.splitext()
    # raises TypeError, so anything that is not a string path is rejected.
    if (
        not isinstance(file_path, str)
        or not file_path
        or not os.path.exists(file_path)
    ):
        print(f"[Syllabus] file path missing or not found: {file_path!r}")
        return DEFAULT_COURSE_TOPICS

    ext = os.path.splitext(file_path)[1].lower()

    try:
        if ext == ".docx":
            topics = parse_syllabus_docx(file_path)
        elif ext == ".pdf":
            topics = parse_syllabus_pdf(file_path)
        elif ext == ".pptx":
            topics = parse_pptx_slides(file_path)
        else:
            print(f"[Syllabus] Unsupported file type for syllabus: {ext}")
            topics = DEFAULT_COURSE_TOPICS
    except Exception as e:
        # Deliberate best-effort boundary: any parser failure falls back
        # to the default outline instead of propagating to the caller.
        print(f"[Syllabus] parse error: {repr(e)}")
        topics = DEFAULT_COURSE_TOPICS

    return topics or DEFAULT_COURSE_TOPICS