# api/syllabus_utils.py """ 工具函数: - 解析 Syllabus(.docx / .pdf / .pptx) - 提取课程大纲 topics """ from __future__ import annotations import os from typing import List from docx import Document from pypdf import PdfReader from pptx import Presentation # python-pptx from api.config import DEFAULT_COURSE_TOPICS def parse_syllabus_docx(path: str) -> List[str]: """ 从 .docx 文件中提取课程大纲。 简单版:按段落抽取,过滤空行;优先识别 Week 开头行。 """ doc = Document(path) paragraphs = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()] week_like = [p for p in paragraphs if p.lower().startswith("week ")] if week_like: return week_like return paragraphs[: len(DEFAULT_COURSE_TOPICS)] or DEFAULT_COURSE_TOPICS def parse_syllabus_pdf(path: str) -> List[str]: """ 简单版 PDF 解析: - 抽取所有页文本 - 按空行切段 - 返回前若干段作为“课程大纲 topics” """ reader = PdfReader(path) pages_text: List[str] = [] for page in reader.pages: text = page.extract_text() or "" if text.strip(): pages_text.append(text) full_text = "\n".join(pages_text) raw_chunks = [chunk.strip() for chunk in full_text.split("\n\n")] chunks = [c for c in raw_chunks if c] return chunks[: len(DEFAULT_COURSE_TOPICS)] or DEFAULT_COURSE_TOPICS def parse_pptx_slides(path: str) -> List[str]: """ 从 .pptx 文件中抽取每一页 slide 的文本(每页一个块)。 """ prs = Presentation(path) slide_texts: List[str] = [] for slide in prs.slides: lines: List[str] = [] for shape in slide.shapes: if hasattr(shape, "text") and shape.text: txt = shape.text.strip() if txt: lines.append(txt) if lines: slide_texts.append("\n".join(lines)) return slide_texts def extract_course_topics_from_file(file_obj, doc_type: str) -> List[str]: """ 根据上传文件和 doc_type 提取课程大纲 topics。 - 只有 doc_type == "syllabus" 时才尝试从文件解析;否则用默认大纲。 - 支持 .docx / .pdf / .pptx """ if file_obj is None: return DEFAULT_COURSE_TOPICS doc_type_norm = (doc_type or "").strip().lower() if doc_type_norm != "syllabus": return DEFAULT_COURSE_TOPICS # 这里必须是“真实可读路径”,你的 server.py 会传 fo.name = /tmp/xxx file_path = getattr(file_obj, "name", None) if not file_path or not os.path.exists(file_path): print(f"[Syllabus] file path missing or not found: {file_path!r}") return DEFAULT_COURSE_TOPICS ext = os.path.splitext(file_path)[1].lower() try: if ext == ".docx": topics = parse_syllabus_docx(file_path) elif ext == ".pdf": topics = parse_syllabus_pdf(file_path) elif ext == ".pptx": topics = parse_pptx_slides(file_path) else: print(f"[Syllabus] Unsupported file type for syllabus: {ext}") topics = DEFAULT_COURSE_TOPICS except Exception as e: print(f"[Syllabus] parse error: {repr(e)}") topics = DEFAULT_COURSE_TOPICS return topics or DEFAULT_COURSE_TOPICS