Spaces:
Sleeping
Sleeping
| # syllabus_utils.py | |
| """ | |
| 工具函数: | |
| - 解析 Syllabus(.docx / .pdf/ .pptx) | |
| - 提取课程大纲 topics | |
| """ | |
| import os | |
| from typing import List | |
| from docx import Document | |
| from pypdf import PdfReader | |
| from config import DEFAULT_COURSE_TOPICS | |
def parse_syllabus_docx(path: str) -> List[str]:
    """
    Extract course-outline topics from a .docx file.

    Every non-blank paragraph is collected with surrounding whitespace
    stripped. If at least one paragraph starts with "week " (compared
    case-insensitively), only those paragraphs are returned. Otherwise the
    leading paragraphs are returned, capped at len(DEFAULT_COURSE_TOPICS);
    when the document yields no text at all, DEFAULT_COURSE_TOPICS is
    returned instead.
    """
    non_empty: List[str] = []
    for para in Document(path).paragraphs:
        raw = para.text
        cleaned = raw.strip() if raw else ""
        if cleaned:
            non_empty.append(cleaned)

    # Paragraphs that look like "Week 1 ..." headings take priority as topics.
    weekly = [line for line in non_empty if line.lower().startswith("week ")]
    if weekly:
        return weekly

    # No week-style headings: fall back to the first few paragraphs.
    leading = non_empty[: len(DEFAULT_COURSE_TOPICS)]
    return leading if leading else DEFAULT_COURSE_TOPICS
def parse_syllabus_pdf(path: str) -> List[str]:
    """
    Extract course-outline topics from a PDF file (simple version).

    Steps:
    - Extract the text of every page, skipping pages with no text.
    - Join the pages with newlines and split the result on blank lines.
    - Return the first len(DEFAULT_COURSE_TOPICS) non-empty chunks.

    A "blank line" here is any line that is empty or contains only
    whitespace, so lines holding stray spaces, tabs or carriage returns
    still separate chunks.

    Returns DEFAULT_COURSE_TOPICS when no text chunk can be extracted.
    """
    import re  # local import: keeps this block self-contained

    reader = PdfReader(path)
    pages_text = []
    for page in reader.pages:
        text = page.extract_text() or ""
        if text.strip():
            pages_text.append(text)
    full_text = "\n".join(pages_text)

    # Split on blank lines. `\s*` between the two newlines also accepts
    # whitespace-only lines (and "\r\n" endings), which a literal "\n\n"
    # split would miss, merging unrelated paragraphs into one chunk.
    raw_chunks = [chunk.strip() for chunk in re.split(r"\n\s*\n", full_text)]
    chunks = [c for c in raw_chunks if c]

    # When used as a syllabus, only the first few chunks are kept as the outline.
    return chunks[: len(DEFAULT_COURSE_TOPICS)] or DEFAULT_COURSE_TOPICS
| from pptx import Presentation # python-pptx | |
def parse_pptx_slides(path: str) -> List[str]:
    """
    Extract the text of each slide in a .pptx file.

    - Each slide with text becomes one string in the returned list (one
      chunk per slide), with the texts of its shapes joined by newlines.
    - Only shapes exposing a non-blank `text` attribute contribute.
    - Slides without any text are omitted from the result.
    """
    deck = Presentation(path)
    per_slide: List[str] = []
    for slide in deck.slides:
        # Lazily pull the text of every shape that has one.
        shape_texts = (
            shape.text for shape in slide.shapes if hasattr(shape, "text")
        )
        fragments: List[str] = [
            t.strip() for t in shape_texts if t and t.strip()
        ]
        if not fragments:
            continue
        per_slide.append("\n".join(fragments))
    return per_slide
def extract_course_topics_from_file(file_obj, doc_type: str) -> List[str]:
    """
    Extract course-outline topics from an uploaded file.

    - Parsing is only attempted when doc_type == "Syllabus"; for any other
      doc_type, or when no file is given, DEFAULT_COURSE_TOPICS is returned.
    - Supported syllabus formats: .docx and .pdf. Any other extension is
      reported and falls back to DEFAULT_COURSE_TOPICS.
    - Parse errors are reported and also fall back to DEFAULT_COURSE_TOPICS.

    Args:
        file_obj: The uploaded file. May be a path (str or os.PathLike) or
            an object exposing the file path through a `.name` attribute.
        doc_type: Document category selected by the caller.

    Returns:
        A non-empty list of topic strings (never an empty list).
    """
    if file_obj is None or doc_type != "Syllabus":
        # Nothing to parse, or not a syllabus: leave the outline at defaults.
        return DEFAULT_COURSE_TOPICS

    if isinstance(file_obj, (str, os.PathLike)):
        # Plain paths must be resolved before looking at `.name`: a str has
        # no `.name`, and pathlib.Path.name is only the final path component,
        # not the full path to the file.
        file_path = os.fspath(file_obj)
    else:
        # Upload wrappers (e.g. Gradio File objects) usually expose the
        # temporary file path as `.name`.
        file_path = getattr(file_obj, "name", None)
    if not file_path:
        return DEFAULT_COURSE_TOPICS

    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".docx":
            topics = parse_syllabus_docx(file_path)
        elif ext == ".pdf":
            topics = parse_syllabus_pdf(file_path)
        else:
            print(f"[Syllabus] Unsupported file type for syllabus: {ext}")
            topics = DEFAULT_COURSE_TOPICS
    except Exception as e:
        # Deliberate best-effort boundary: a broken upload must not crash the
        # caller, so report the error and fall back to the default outline.
        print(f"[Syllabus] parse error: {repr(e)}")
        topics = DEFAULT_COURSE_TOPICS

    # Final safety net so an empty list is never returned.
    return topics or DEFAULT_COURSE_TOPICS