Spaces:

oceanicdayi
/

AIML_learning

Runtime error

AIML_learning / scripts /extract_pptx.py

oceanicdayi

feat: add deployment configs for GitHub Pages and HF Spaces

c58a383 30 days ago

3.97 kB

	"""
	extract_pptx.py - 提取 PPTX 簡報的所有文字與圖片資訊

	使用方式:
	python scripts/extract_pptx.py <pptx_path>
	"""

	import sys
	from pathlib import Path
	from pptx import Presentation
	from pptx.util import Inches, Pt
	from pptx.enum.shapes import MSO_SHAPE_TYPE


	def _append_paragraphs(shape, texts: list[dict]) -> None:
	    """Append each non-blank paragraph of *shape*'s text frame to *texts*."""
	    for para in shape.text_frame.paragraphs:
	        text = para.text.strip()
	        if text:
	            texts.append({"text": text, "level": para.level})


	def _table_rows(table) -> list[list[str]]:
	    """Return the table as a list of rows of stripped cell strings."""
	    return [[cell.text.strip() for cell in row.cells] for row in table.rows]


	def _picture_info(shape) -> dict:
	    """Describe a picture shape by content type, size and name."""
	    try:
	        content_type = shape.image.content_type
	    except ValueError:
	        # Accessing ``image`` failed with ValueError; the original code
	        # reports such pictures as "linked" (no embedded image data).
	        content_type = "linked"
	    return {
	        "content_type": content_type,
	        # Raw width x height values as reported by the shape object.
	        "size": f"{shape.width}x{shape.height}",
	        "name": shape.name,
	    }


	def _collect_shape(shape, slide_info: dict) -> None:
	    """Record the text, table and picture content of *shape* in *slide_info*.

	    Group shapes are walked recursively, so content nested at any depth
	    (including tables and pictures inside groups) is collected.
	    """
	    if shape.has_text_frame:
	        _append_paragraphs(shape, slide_info["texts"])

	    if shape.has_table:
	        slide_info["tables"].append(_table_rows(shape.table))

	    # Read shape_type once. NOTE(review): python-pptx appears to raise
	    # NotImplementedError for shapes it cannot classify; such shapes are
	    # treated here as neither picture nor group instead of aborting the
	    # whole extraction -- confirm against the installed version.
	    try:
	        shape_type = shape.shape_type
	    except NotImplementedError:
	        shape_type = None

	    if shape_type == MSO_SHAPE_TYPE.PICTURE:
	        slide_info["images"].append(_picture_info(shape))
	    elif shape_type == MSO_SHAPE_TYPE.GROUP:
	        for child in shape.shapes:
	            _collect_shape(child, slide_info)


	def extract_slide_content(pptx_path: str) -> list[dict]:
	    """Extract text, tables, picture metadata and notes from every slide.

	    Args:
	        pptx_path: Path of the presentation, passed to ``Presentation``.

	    Returns:
	        One dict per slide, in presentation order, with the keys:

	        * ``slide_number``: 1-based position of the slide.
	        * ``layout``: layout name, or ``"Unknown"`` if the slide has none.
	        * ``texts``: ``{"text", "level"}`` dicts, one per non-blank
	          paragraph, including paragraphs inside (nested) group shapes.
	        * ``tables``: one list of rows per table; each row is a list of
	          stripped cell strings.
	        * ``images``: ``{"content_type", "size", "name"}`` dicts, one per
	          picture shape; ``content_type`` is ``"linked"`` when the image
	          data cannot be read from the shape.
	        * ``notes``: stripped speaker-notes text, ``""`` when absent.
	    """
	    prs = Presentation(pptx_path)
	    slides_data = []

	    for slide_num, slide in enumerate(prs.slides, 1):
	        slide_info = {
	            "slide_number": slide_num,
	            "layout": slide.slide_layout.name if slide.slide_layout else "Unknown",
	            "texts": [],
	            "tables": [],
	            "images": [],
	            "notes": "",
	        }

	        # Check has_notes_slide first so the notes_slide attribute is only
	        # touched for slides that report having notes.
	        if slide.has_notes_slide and slide.notes_slide.notes_text_frame:
	            slide_info["notes"] = slide.notes_slide.notes_text_frame.text.strip()

	        for shape in slide.shapes:
	            _collect_shape(shape, slide_info)

	        slides_data.append(slide_info)

	    return slides_data


	def format_output(slides: list[dict]) -> str:
	    """Render extracted slide data as a plain-text report.

	    Args:
	        slides: Slide dicts with the keys ``slide_number``, ``layout``,
	            ``texts``, ``tables``, ``images`` and ``notes``.

	    Returns:
	        The report as one string. Each slide starts with a header framed by
	        two 60-character ``=`` rules, followed by its text lines, then any
	        ``[TABLE]`` blocks (cells joined with `` | ``), an ``[IMAGES]``
	        listing and a ``[NOTES]`` line. Returns ``""`` for an empty list.
	    """
	    rule = "=" * 60
	    lines = []

	    for slide in slides:
	        lines.append(f"\n{rule}")
	        lines.append(f"## Slide {slide['slide_number']} (Layout: {slide['layout']})")
	        lines.append(rule)

	        for item in slide["texts"]:
	            level = item["level"]
	            # Only indented paragraphs get a bullet; level-0 text is
	            # printed bare.
	            prefix = (" " * level + "- ") if level > 0 else ""
	            lines.append(f"{prefix}{item['text']}")

	        for table in slide["tables"]:
	            lines.append("\n[TABLE]")
	            # Plain " | " separator: the previous "\|" was an invalid
	            # escape sequence that leaked a backslash into every row.
	            lines.extend(" | ".join(row) for row in table)
	            lines.append("[/TABLE]")

	        images = slide["images"]
	        if images:
	            lines.append(f"\n[IMAGES: {len(images)} image(s)]")
	            lines.extend(
	                f" - {img['name']} ({img['content_type']})" for img in images
	            )

	        if slide["notes"]:
	            lines.append(f"\n[NOTES]: {slide['notes']}")

	    return "\n".join(lines)


	if __name__ == "__main__":
	    # Everything after the script name; the first entry is the PPTX path.
	    cli_args = sys.argv[1:]
	    if not cli_args:
	        print("Usage: python extract_pptx.py <pptx_path>")
	        sys.exit(1)

	    # Extract, print the formatted report, then a slide-count footer.
	    pptx_path = cli_args[0]
	    slides = extract_slide_content(pptx_path)
	    print(format_output(slides))
	    print(f"\n\nTotal slides: {len(slides)}")