File size: 3,966 Bytes
c58a383
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
extract_pptx.py - Extract the text, tables, images, and notes of a PPTX presentation

Usage:
    python scripts/extract_pptx.py <pptx_path>
"""

import sys
from pathlib import Path
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.enum.shapes import MSO_SHAPE_TYPE


def _collect_shape_content(shapes, slide_info: dict) -> None:
    """Append the text, tables, and pictures found in *shapes* to *slide_info*.

    Group shapes are descended into recursively, so text, tables, and
    pictures nested at any depth are recorded, in iteration order, together
    with the top-level content.

    Args:
        shapes: Iterable of python-pptx shape objects (a slide's or a group
            shape's ``shapes`` collection).
        slide_info: Per-slide dict whose ``"texts"``, ``"tables"`` and
            ``"images"`` lists are extended in place.
    """
    for shape in shapes:
        # Text frames: one entry per non-empty paragraph.
        if shape.has_text_frame:
            for para in shape.text_frame.paragraphs:
                text = para.text.strip()
                if text:
                    slide_info["texts"].append({"text": text, "level": para.level})

        # Tables: a list of rows, each row a list of stripped cell strings.
        if shape.has_table:
            slide_info["tables"].append(
                [[cell.text.strip() for cell in row.cells] for row in shape.table.rows]
            )

        shape_type = shape.shape_type
        if shape_type == MSO_SHAPE_TYPE.PICTURE:
            try:
                content_type = shape.image.content_type
            except ValueError:
                # A ValueError from ``shape.image`` is treated as a picture
                # without an embedded image and reported as "linked".
                content_type = "linked"
            slide_info["images"].append({
                "content_type": content_type,
                # width x height as reported by python-pptx
                # (presumably EMU - confirm before relying on the unit).
                "size": f"{shape.width}x{shape.height}",
                "name": shape.name,
            })
        elif shape_type == MSO_SHAPE_TYPE.GROUP:
            # Recurse so nested groups, grouped pictures, and grouped tables
            # are not silently dropped.
            _collect_shape_content(shape.shapes, slide_info)


def extract_slide_content(pptx_path: str) -> list[dict]:
    """Extract text, tables, images, and notes from every slide of a PPTX file.

    Args:
        pptx_path: Path of the presentation, passed to ``Presentation``.

    Returns:
        One dict per slide, in presentation order, with the keys:

        * ``"slide_number"``: 1-based slide index.
        * ``"layout"``: layout name, or ``"Unknown"`` when the slide has no
          layout.
        * ``"texts"``: list of ``{"text": str, "level": int}`` for every
          non-empty paragraph, including those inside (nested) groups.
        * ``"tables"``: list of tables, each a list of rows of cell strings.
        * ``"images"``: list of ``{"content_type", "size", "name"}`` dicts.
        * ``"notes"``: stripped notes text, or ``""`` when there is none.
    """
    prs = Presentation(pptx_path)
    slides_data = []

    for slide_num, slide in enumerate(prs.slides, 1):
        slide_info = {
            "slide_number": slide_num,
            "layout": slide.slide_layout.name if slide.slide_layout else "Unknown",
            "texts": [],
            "tables": [],
            "images": [],
            "notes": "",
        }

        # Check has_notes_slide before touching notes_slide.
        if slide.has_notes_slide and slide.notes_slide.notes_text_frame:
            slide_info["notes"] = slide.notes_slide.notes_text_frame.text.strip()

        _collect_shape_content(slide.shapes, slide_info)
        slides_data.append(slide_info)

    return slides_data


def format_output(slides: list[dict]) -> str:
    """Render extracted slide data as a plain-text report.

    Args:
        slides: Slide dicts carrying the keys ``slide_number``, ``layout``,
            ``texts``, ``tables``, ``images`` and ``notes``.

    Returns:
        The report as a single newline-joined string: per slide a ruled
        header, then the paragraphs (indented bullets for levels above 0),
        ``[TABLE]`` sections, an ``[IMAGES]`` listing, and a ``[NOTES]``
        line. Returns ``""`` when *slides* is empty.
    """
    rule = "=" * 60
    out = []

    for entry in slides:
        # Header: the leading newline separates consecutive slides.
        out.extend((
            f"\n{rule}",
            f"## Slide {entry['slide_number']} (Layout: {entry['layout']})",
            rule,
        ))

        # Paragraphs: level 0 is printed bare, deeper levels become bullets.
        for para in entry["texts"]:
            depth = para["level"]
            if depth > 0:
                bullet = "  " * depth + "- "
            else:
                bullet = ""
            out.append(f"{bullet}{para['text']}")

        # Tables: one line per row, cells separated by " | ".
        for grid in entry["tables"]:
            out.append("\n[TABLE]")
            out.extend(" | ".join(cells) for cells in grid)
            out.append("[/TABLE]")

        pictures = entry["images"]
        if pictures:
            out.append(f"\n[IMAGES: {len(pictures)} image(s)]")
            out.extend(f"  - {pic['name']} ({pic['content_type']})" for pic in pictures)

        notes = entry["notes"]
        if notes:
            out.append(f"\n[NOTES]: {notes}")

    return "\n".join(out)


if __name__ == "__main__":
    # CLI entry point: the first positional argument is the PPTX path.
    try:
        pptx_path = sys.argv[1]
    except IndexError:
        # No path supplied: show usage and exit with a failure status.
        print("Usage: python extract_pptx.py <pptx_path>")
        sys.exit(1)

    slides = extract_slide_content(pptx_path)
    report = format_output(slides)
    summary = f"\n\nTotal slides: {len(slides)}"
    print(report)
    print(summary)