File size: 3,372 Bytes
834218c
ec485a2
 
c51223f
ec485a2
 
 
17a4c7b
 
ec485a2
 
 
 
 
c51223f
ec485a2
834218c
ec485a2
 
 
 
 
834218c
ec485a2
 
 
 
 
 
 
 
 
 
 
 
 
 
834218c
 
 
ec485a2
 
224ad70
 
ec485a2
 
 
 
 
 
 
 
 
 
 
 
 
d2891a7
 
834218c
d2891a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec485a2
 
 
224ad70
834218c
ec485a2
 
 
 
9cc2a58
 
ec485a2
 
224ad70
 
 
 
ec485a2
 
 
 
 
 
 
 
 
c51223f
 
ec485a2
 
 
 
 
 
 
c51223f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# api/syllabus_utils.py
"""
工具函数:
- 解析 Syllabus(.docx / .pdf / .pptx)
- 提取课程大纲 topics
"""

from __future__ import annotations

import os
import re
from typing import List

from docx import Document
from pptx import Presentation  # python-pptx
from pypdf import PdfReader

from api.config import DEFAULT_COURSE_TOPICS


def parse_syllabus_docx(path: str) -> List[str]:
    """
    Extract the course outline from a .docx file.

    Simple approach: every paragraph is whitespace-trimmed and empty ones are
    dropped. If any remaining paragraph starts with "week " (case-insensitive),
    only those paragraphs are returned. Otherwise the leading paragraphs are
    returned, capped at the number of default topics; a document without any
    text yields ``DEFAULT_COURSE_TOPICS``.
    """
    non_empty: List[str] = []
    weekly: List[str] = []

    # Single pass: collect all non-empty paragraphs and, alongside them,
    # the subset that looks like a "Week ..." heading.
    for para in Document(path).paragraphs:
        stripped = (para.text or "").strip()
        if not stripped:
            continue
        non_empty.append(stripped)
        if stripped.lower().startswith("week "):
            weekly.append(stripped)

    if weekly:
        return weekly

    limit = len(DEFAULT_COURSE_TOPICS)
    head = non_empty[:limit]
    return head if head else DEFAULT_COURSE_TOPICS


def parse_syllabus_pdf(path: str) -> List[str]:
    """
    Extract syllabus topics from a PDF (simple heuristic).

    - Pull the text of every page, skipping pages with no visible text.
    - Join the pages with a newline and split the result at blank lines.
    - Return the first few paragraphs as the course-outline topics.

    A "blank line" here is any line that is empty or contains only
    whitespace (spaces, tabs, carriage returns), so paragraphs separated by
    such lines are split as well.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        At most ``len(DEFAULT_COURSE_TOPICS)`` leading paragraphs, or
        ``DEFAULT_COURSE_TOPICS`` itself when no paragraph was found.
    """
    reader = PdfReader(path)

    # extract_text() is guarded with ``or ""`` so a falsy result (None or
    # empty string) is treated as a page without text.
    extracted = (page.extract_text() or "" for page in reader.pages)
    full_text = "\n".join(text for text in extracted if text.strip())

    # Split on a newline, optional whitespace, then another newline. A plain
    # "\n\n" split would miss separators such as "\n \n" or "\r\n\r\n".
    paragraphs = [part.strip() for part in re.split(r"\n\s*\n", full_text)]
    paragraphs = [part for part in paragraphs if part]

    return paragraphs[: len(DEFAULT_COURSE_TOPICS)] or DEFAULT_COURSE_TOPICS


def parse_pptx_slides(path: str) -> List[str]:
    """
    Extract the text of each slide in a .pptx file, one block per slide.

    For every slide, the trimmed text of each shape that exposes a non-empty
    ``text`` attribute is collected and the pieces are joined with newlines.
    Slides that yield no text are omitted from the result.
    """
    blocks: List[str] = []

    for slide in Presentation(path).slides:
        # Shapes without a ``text`` attribute come through as None and are
        # filtered out together with empty / whitespace-only text.
        raw_texts = (getattr(shape, "text", None) for shape in slide.shapes)
        cleaned = [raw.strip() for raw in raw_texts if raw and raw.strip()]
        if cleaned:
            blocks.append("\n".join(cleaned))

    return blocks


def extract_course_topics_from_file(file_obj, doc_type: str) -> List[str]:
    """
    Extract course-outline topics from an uploaded file.

    Parsing is attempted only when ``doc_type`` (trimmed, case-insensitive)
    equals "syllabus". Any other value, a missing file, an unsupported
    extension, or a parser error yields ``DEFAULT_COURSE_TOPICS``.
    Supported extensions: .docx / .pdf / .pptx.

    Args:
        file_obj: The uploaded file. Either a filesystem path (``str``,
            ``bytes`` or ``os.PathLike``) or an object whose ``name``
            attribute holds a real, readable path (the caller in server.py
            is expected to pass something like ``/tmp/xxx`` there).
            ``None`` returns the defaults.
        doc_type: Document category label; ``None`` or empty is treated as
            "not a syllabus".

    Returns:
        The parsed topics, or ``DEFAULT_COURSE_TOPICS`` when nothing usable
        could be extracted.
    """
    if file_obj is None:
        return DEFAULT_COURSE_TOPICS

    if (doc_type or "").strip().lower() != "syllabus":
        return DEFAULT_COURSE_TOPICS

    path_types = (str, bytes, os.PathLike)

    # A path-like input is the path itself. This is checked before ``.name``
    # because pathlib paths also expose ``name``, but only as the final
    # path component, which would point at the wrong location.
    if isinstance(file_obj, path_types):
        raw_path = file_obj
    else:
        raw_path = getattr(file_obj, "name", None)

    # File objects opened from a descriptor report an int ``name``.
    # os.path.exists() accepts ints, but os.path.splitext() raises TypeError
    # on them, so anything that is not path-like is treated as missing.
    file_path = os.fsdecode(raw_path) if isinstance(raw_path, path_types) else None
    if not file_path or not os.path.exists(file_path):
        print(f"[Syllabus] file path missing or not found: {raw_path!r}")
        return DEFAULT_COURSE_TOPICS

    ext = os.path.splitext(file_path)[1].lower()

    # Best-effort parsing: any parser failure falls back to the defaults
    # rather than propagating to the caller.
    try:
        if ext == ".docx":
            topics = parse_syllabus_docx(file_path)
        elif ext == ".pdf":
            topics = parse_syllabus_pdf(file_path)
        elif ext == ".pptx":
            topics = parse_pptx_slides(file_path)
        else:
            print(f"[Syllabus] Unsupported file type for syllabus: {ext}")
            topics = DEFAULT_COURSE_TOPICS
    except Exception as e:
        print(f"[Syllabus] parse error: {repr(e)}")
        topics = DEFAULT_COURSE_TOPICS

    # A parser may legitimately return an empty list (e.g. no text found).
    return topics or DEFAULT_COURSE_TOPICS