test_AI_Agent

Sleeping

App Files Files Community

test_AI_Agent / api /syllabus_utils.py

SarahXia0405

Rename syllabus_utils.py to api/syllabus_utils.py

ffa8016 verified about 2 months ago

raw

history blame

3.55 kB

	# syllabus_utils.py
	"""
	工具函数：
	- 解析 Syllabus（.docx / .pdf/ .pptx）
	- 提取课程大纲 topics
	"""

	import os
	from typing import List

	from docx import Document
	from pypdf import PdfReader

	from config import DEFAULT_COURSE_TOPICS


	def parse_syllabus_docx(path: str) -> List[str]:
	    """
	    Extract course-outline topics from a .docx syllabus.

	    Every paragraph is stripped and empty ones are dropped. If any of the
	    remaining paragraphs start with "week " (case-insensitive), exactly
	    those paragraphs are returned. Otherwise the first
	    len(DEFAULT_COURSE_TOPICS) paragraphs are returned, and
	    DEFAULT_COURSE_TOPICS itself is returned when the document has no text.
	    """
	    non_empty: List[str] = []
	    week_entries: List[str] = []

	    # Single pass: collect every non-blank paragraph and, alongside it,
	    # the subset that looks like a "Week N ..." heading.
	    for para in Document(path).paragraphs:
	        stripped = para.text.strip() if para.text else ""
	        if not stripped:
	            continue
	        non_empty.append(stripped)
	        if stripped.lower().startswith("week "):
	            week_entries.append(stripped)

	    if week_entries:
	        return week_entries

	    # No week-style headings: fall back to the leading paragraphs.
	    leading = non_empty[: len(DEFAULT_COURSE_TOPICS)]
	    return leading if leading else DEFAULT_COURSE_TOPICS


	def parse_syllabus_pdf(path: str) -> List[str]:
	    """
	    Extract course-outline chunks from a PDF syllabus (simple version).

	    - Pull the text out of every page, skipping pages with no text.
	    - Join the pages with newlines and split the result on blank lines.
	    - Return the first len(DEFAULT_COURSE_TOPICS) non-empty chunks, or
	      DEFAULT_COURSE_TOPICS when nothing could be extracted.

	    A "blank line" is any line that is empty or contains only whitespace,
	    so separators such as "\\n \\n" or "\\r\\n\\r\\n" also start a new chunk.
	    """
	    # Local import keeps this block self-contained; `re` is stdlib.
	    import re

	    reader = PdfReader(path)

	    pages_text = []
	    for page in reader.pages:
	        # extract_text() may yield nothing for a page; treat that as "".
	        text = page.extract_text() or ""
	        if text.strip():
	            pages_text.append(text)

	    full_text = "\n".join(pages_text)

	    # Split on runs of blank lines. A plain split("\n\n") misses blank
	    # lines that carry stray spaces or carriage returns, which would fuse
	    # separate paragraphs into one chunk.
	    raw_chunks = (chunk.strip() for chunk in re.split(r"\n\s*\n", full_text))
	    chunks = [chunk for chunk in raw_chunks if chunk]

	    # Only the leading chunks are used as the "course outline".
	    return chunks[: len(DEFAULT_COURSE_TOPICS)] or DEFAULT_COURSE_TOPICS


	from pptx import Presentation # python-pptx

	def parse_pptx_slides(path: str) -> List[str]:
	    """
	    Extract the text of each slide in a .pptx file.

	    Returns one string per slide, built by joining the stripped text of
	    the slide's shapes with newlines (each slide is meant to serve as one
	    RAG chunk). Shapes without text are ignored, and slides that yield no
	    text at all are left out of the result.
	    """
	    deck = Presentation(path)
	    per_slide: List[str] = []

	    for slide in deck.slides:
	        # Keep only shapes that expose a non-empty `text` attribute.
	        stripped_texts = [
	            shape.text.strip()
	            for shape in slide.shapes
	            if hasattr(shape, "text") and shape.text
	        ]
	        # Text that was whitespace-only is empty after stripping; drop it.
	        fragments = [fragment for fragment in stripped_texts if fragment]
	        if fragments:
	            per_slide.append("\n".join(fragments))

	    return per_slide


	def extract_course_topics_from_file(file_obj, doc_type: str) -> List[str]:
	    """
	    Extract course-outline topics from an uploaded file.

	    - The file is parsed only when doc_type == "Syllabus"; for any other
	      doc_type, or when no file is given, DEFAULT_COURSE_TOPICS is returned.
	    - Supported syllabus formats: .docx and .pdf (matched case-insensitively
	      on the file extension). Other extensions fall back to the defaults.
	    - Any exception raised while parsing is printed and the defaults are
	      returned instead of propagating the error.

	    Args:
	        file_obj: The uploaded file. Either an object exposing the file
	            path as a ``.name`` attribute (e.g. a temp-file wrapper), or
	            the path itself as ``str`` / ``bytes`` / ``os.PathLike``.
	        doc_type: Document category chosen by the caller.

	    Returns:
	        A non-empty list of topic strings.
	    """
	    if file_obj is None:
	        return DEFAULT_COURSE_TOPICS

	    if doc_type != "Syllabus":
	        # Not a syllabus: leave the course outline alone (use the default).
	        return DEFAULT_COURSE_TOPICS

	    if isinstance(file_obj, (str, bytes, os.PathLike)):
	        # The upload was handed over as a bare path rather than a file
	        # wrapper; without this branch such uploads were silently ignored.
	        file_path = os.fsdecode(file_obj)
	    else:
	        # File wrappers usually expose the temp-file path as `.name`.
	        file_path = getattr(file_obj, "name", None)

	    if not file_path:
	        return DEFAULT_COURSE_TOPICS

	    ext = os.path.splitext(file_path)[1].lower()

	    try:
	        if ext == ".docx":
	            topics = parse_syllabus_docx(file_path)
	        elif ext == ".pdf":
	            topics = parse_syllabus_pdf(file_path)
	        else:
	            print(f"[Syllabus] Unsupported file type for syllabus: {ext}")
	            topics = DEFAULT_COURSE_TOPICS
	    except Exception as e:
	        # Deliberate best-effort: a broken upload must not crash the caller.
	        print(f"[Syllabus] parse error: {repr(e)}")
	        topics = DEFAULT_COURSE_TOPICS

	    # Final safety net so an empty list is never returned.
	    return topics or DEFAULT_COURSE_TOPICS