Spaces:
Sleeping
Sleeping
File size: 3,372 Bytes
834218c ec485a2 c51223f ec485a2 17a4c7b ec485a2 c51223f ec485a2 834218c ec485a2 834218c ec485a2 834218c ec485a2 224ad70 ec485a2 d2891a7 834218c d2891a7 ec485a2 224ad70 834218c ec485a2 9cc2a58 ec485a2 224ad70 ec485a2 c51223f ec485a2 c51223f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
# api/syllabus_utils.py
"""
工具函数:
- 解析 Syllabus(.docx / .pdf / .pptx)
- 提取课程大纲 topics
"""
from __future__ import annotations
import os
from typing import List
from docx import Document
from pypdf import PdfReader
from pptx import Presentation # python-pptx
from api.config import DEFAULT_COURSE_TOPICS
def parse_syllabus_docx(path: str) -> List[str]:
    """
    Extract syllabus topics from a .docx file.

    Simple approach: walk the document's paragraphs, strip each one and drop
    the empty ones. If any paragraph starts with "week " (case-insensitive),
    only those paragraphs are returned. Otherwise the first
    len(DEFAULT_COURSE_TOPICS) paragraphs are returned, falling back to
    DEFAULT_COURSE_TOPICS when there are none.
    """
    non_empty: List[str] = []
    weekly: List[str] = []

    for para in Document(path).paragraphs:
        stripped = (para.text or "").strip()
        if not stripped:
            continue
        non_empty.append(stripped)
        # Lines such as "Week 3: ..." are preferred as topic entries.
        if stripped.lower().startswith("week "):
            weekly.append(stripped)

    if weekly:
        return weekly

    head = non_empty[: len(DEFAULT_COURSE_TOPICS)]
    return head if head else DEFAULT_COURSE_TOPICS
def parse_syllabus_pdf(path: str) -> List[str]:
    """
    Extract syllabus topics from a PDF file (simple version).

    Steps:
    - Extract the text of every page, skipping pages with no visible text.
    - Join the page texts with a single newline.
    - Split the result into paragraphs on blank lines. A line that contains
      only whitespace (spaces, tabs, a stray carriage return) counts as
      blank, not just two directly adjacent newlines.
    - Return the first len(DEFAULT_COURSE_TOPICS) paragraphs, or
      DEFAULT_COURSE_TOPICS itself when no paragraph was found.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        A list of stripped paragraph strings, or DEFAULT_COURSE_TOPICS.
    """
    reader = PdfReader(path)

    pages_text: List[str] = []
    for page in reader.pages:
        # `or ""` guards against extract_text() returning a falsy value.
        text = page.extract_text() or ""
        if text.strip():
            pages_text.append(text)
    full_text = "\n".join(pages_text)

    # Group consecutive non-blank lines into paragraphs. Splitting on the
    # literal "\n\n" would miss separators such as "\n \n" or "\r\n\r\n",
    # gluing unrelated paragraphs into a single chunk.
    chunks: List[str] = []
    current: List[str] = []
    for line in full_text.split("\n"):
        if line.strip():
            current.append(line)
        elif current:
            chunks.append("\n".join(current).strip())
            current = []
    if current:
        chunks.append("\n".join(current).strip())

    return chunks[: len(DEFAULT_COURSE_TOPICS)] or DEFAULT_COURSE_TOPICS
def parse_pptx_slides(path: str) -> List[str]:
    """
    Extract the text of each slide in a .pptx file, one block per slide.

    For every slide, the stripped text of each shape that exposes a non-empty
    `text` attribute is collected and joined with newlines. Slides that
    yield no text are omitted from the result.
    """
    deck = Presentation(path)
    per_slide: List[str] = []

    for slide in deck.slides:
        # Shapes without a `text` attribute yield None and are filtered out.
        raw_texts = (getattr(shape, "text", None) for shape in slide.shapes)
        cleaned = [raw.strip() for raw in raw_texts if raw and raw.strip()]
        if cleaned:
            per_slide.append("\n".join(cleaned))

    return per_slide
def extract_course_topics_from_file(file_obj, doc_type: str) -> List[str]:
    """
    Derive the course topics from an uploaded file and its doc_type.

    - Parsing is attempted only when doc_type (trimmed, case-insensitive) is
      "syllabus"; otherwise DEFAULT_COURSE_TOPICS is returned.
    - Supported extensions: .docx / .pdf / .pptx
    - A missing file, an unsupported extension, a parse error or an empty
      parse result all fall back to DEFAULT_COURSE_TOPICS.
    """
    if file_obj is None or (doc_type or "").strip().lower() != "syllabus":
        return DEFAULT_COURSE_TOPICS

    # This must be a real, readable path; per the original note, server.py
    # passes fo.name = /tmp/xxx.
    file_path = getattr(file_obj, "name", None)
    if not (file_path and os.path.exists(file_path)):
        print(f"[Syllabus] file path missing or not found: {file_path!r}")
        return DEFAULT_COURSE_TOPICS

    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Map each supported extension to the parser that handles it.
    parsers = {
        ".docx": parse_syllabus_docx,
        ".pdf": parse_syllabus_pdf,
        ".pptx": parse_pptx_slides,
    }

    try:
        parser = parsers.get(suffix)
        if parser is None:
            print(f"[Syllabus] Unsupported file type for syllabus: {suffix}")
            topics = DEFAULT_COURSE_TOPICS
        else:
            topics = parser(file_path)
    except Exception as e:
        # Best-effort: any parsing failure is logged and the defaults are used.
        print(f"[Syllabus] parse error: {repr(e)}")
        topics = DEFAULT_COURSE_TOPICS

    return topics if topics else DEFAULT_COURSE_TOPICS
|