Moses Paul R committed on
Commit
86ab66e
·
1 Parent(s): 0c97c78

add powerpoint

Browse files
Files changed (3) hide show
  1. marker/providers/powerpoint.py +252 -0
  2. poetry.lock +29 -1
  3. pyproject.toml +1 -0
marker/providers/powerpoint.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import os
3
+ import traceback
4
+
5
+ from pptx import Presentation
6
+ from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
7
+ from weasyprint import CSS, HTML
8
+
9
+ from marker.providers.pdf import PdfProvider
10
+
11
# Stylesheet passed to WeasyPrint (via CSS(string=css)) when the generated slide
# HTML is rendered to PDF: landscape A4 pages with 1.5cm margins, full-width
# bordered tables whose rows avoid being split across pages, and images that
# are scaled down to fit the available width.
css = '''
@page {
size: A4 landscape;
margin: 1.5cm;
}

table {
width: 100%;
border-collapse: collapse;
break-inside: auto;
font-size: 10pt;
}

tr {
break-inside: avoid;
page-break-inside: avoid;
}

td {
border: 0.75pt solid #000;
padding: 6pt;
}

img {
max-width: 100%;
height: auto;
object-fit: contain;
}
'''
40
+
41
+
42
class PowerPointProvider(PdfProvider):
    """Provider that reads a PowerPoint (.pptx) file by converting it to PDF.

    The deck is parsed with python-pptx, every slide is rendered to a simple
    HTML ``<section>`` (headings, paragraphs, lists, tables and inline base64
    images), the HTML is written to a temporary PDF with WeasyPrint, and that
    PDF is then handed to ``PdfProvider``.
    """

    # When True, each slide's section starts with a "Slide N" heading.
    # NOTE(review): the conversion runs before PdfProvider.__init__ receives
    # ``config``, so a value supplied only through ``config`` is presumably not
    # visible while the HTML is built -- confirm against PdfProvider.
    include_slide_number: bool = False

    # Single-pass translation table used by _escape_html.
    _HTML_ESCAPES = str.maketrans({
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        '"': "&quot;",
        "'": "&#39;",
    })

    def __init__(self, filepath: str, config=None):
        """Convert ``filepath`` to a temporary PDF and initialise the PDF provider.

        Args:
            filepath: Path of the .pptx file to read.
            config: Passed through unchanged to ``PdfProvider.__init__``.

        Raises:
            ValueError: If the PPTX -> PDF conversion fails; the original
                exception is chained as ``__cause__``.
        """
        import tempfile

        # Reserve a unique, writable path in the platform's temp directory.
        # The handle is closed immediately so WeasyPrint can (re)open the path
        # on every platform; the file itself is removed in __del__.
        temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        self.temp_pdf_path = temp_pdf.name
        temp_pdf.close()

        # Convert PPTX to PDF
        try:
            self.convert_pptx_to_pdf(filepath)
        except Exception as e:
            print(traceback.format_exc())
            raise ValueError(f"Error converting PPTX to PDF: {e}") from e

        # Initialize the PDF provider with the temp pdf path
        super().__init__(self.temp_pdf_path, config)

    def __del__(self):
        """Remove the temporary PDF, if one was created."""
        # __del__ also runs for partially constructed instances, so the
        # attribute may be missing.
        temp_pdf_path = getattr(self, "temp_pdf_path", None)
        if temp_pdf_path is None:
            return
        try:
            os.remove(temp_pdf_path)
        except OSError:
            # Best-effort cleanup: the file may already be gone or still be
            # locked, and a finalizer has nobody to report the failure to.
            pass

    def convert_pptx_to_pdf(self, filepath):
        """Render the deck at ``filepath`` to ``self.temp_pdf_path``.

        Each slide becomes one ``<section>``; the sections are joined with
        newlines and written as a PDF using the module-level ``css``.
        """
        pptx = Presentation(filepath)

        html_parts = []
        for slide_index, slide in enumerate(pptx.slides):
            html_parts.append("<section>")
            if self.include_slide_number:
                html_parts.append(f"<h2>Slide {slide_index + 1}</h2>")

            # Process shapes in the slide
            for shape in slide.shapes:
                fragment = self._handle_shape(shape)
                if fragment:
                    html_parts.append(fragment)

            html_parts.append("</section>")

        html = "\n".join(html_parts)

        # We convert the HTML into a PDF
        HTML(string=html).write_pdf(
            self.temp_pdf_path,
            stylesheets=[CSS(string=css)],
        )

    def _handle_shape(self, shape) -> str:
        """Return the HTML for one shape, or "" when it has nothing to render.

        Checks are applied in this order: group, table, picture, text.
        """
        # Groups are expanded recursively.
        if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
            return self._handle_group(shape)

        if shape.has_table:
            return self._handle_table(shape)

        if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
            return self._handle_image(shape)

        # Shapes without a ``text`` attribute (or with text None) are skipped.
        text = getattr(shape, "text", None)
        if text is None:
            return ""
        if shape.has_text_frame:
            # Distinguish placeholders (title, subtitle, etc.)
            return self._handle_text(shape)
        return f"<p>{self._escape_html(text)}</p>"

    def _handle_group(self, group_shape) -> str:
        """
        Recursively handle shapes in a group. Returns HTML string for the entire group.
        """
        return "".join(self._handle_shape(shape) for shape in group_shape.shapes)

    def _handle_text(self, shape) -> str:
        """
        Processes shape text, including bullet/numbered list detection and placeholders
        (title, subtitle, etc.). Returns HTML for the text block(s).

        Title and centre-title placeholders render their plain paragraphs as
        ``<h3>``, subtitle placeholders as ``<h4>``, everything else as ``<p>``.
        Consecutive bullet paragraphs are wrapped in ``<ul>`` and auto-numbered
        ones in ``<ol>``. Paragraphs whose runs contain no text produce no
        element of their own.
        """
        # Distinguish placeholders to see if it's a title or subtitle
        label_html_tag = "p"
        if shape.is_placeholder:
            placeholder_type = shape.placeholder_format.type
            if placeholder_type in (PP_PLACEHOLDER.TITLE, PP_PLACEHOLDER.CENTER_TITLE):
                label_html_tag = "h3"
            elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
                label_html_tag = "h4"

        html_parts = []
        # "ul" or "ol" while a list is open, None otherwise.
        list_type = None

        for paragraph in shape.text_frame.paragraphs:
            p_el = paragraph._element
            has_bullet_char = p_el.find(".//a:buChar", namespaces=p_el.nsmap) is not None
            is_numbered = p_el.find(".//a:buAutoNum", namespaces=p_el.nsmap) is not None
            # Indented paragraphs are treated as bullets even without an
            # explicit bullet character.
            is_bullet = has_bullet_char or paragraph.level > 0

            # Text is gathered from the paragraph's runs and escaped once.
            p_text = self._escape_html("".join(run.text for run in paragraph.runs))

            if is_bullet or is_numbered:
                # Numbering wins when a paragraph qualifies as both.
                current_list_type = "ol" if is_numbered else "ul"
                if list_type != current_list_type:
                    # Close a list of the other kind before opening this one.
                    if list_type is not None:
                        html_parts.append(f"</{list_type}>")
                    list_type = current_list_type
                    html_parts.append(f"<{list_type}>")
                if p_text:
                    html_parts.append(f"<li>{p_text}</li>")
                continue

            # A normal paragraph ends any list that is still open.
            if list_type is not None:
                html_parts.append(f"</{list_type}>")
                list_type = None
            if p_text:
                html_parts.append(f"<{label_html_tag}>{p_text}</{label_html_tag}>")

        # If the text frame ended and we still have an open list, close it
        if list_type is not None:
            html_parts.append(f"</{list_type}>")

        return "".join(html_parts)

    def _handle_image(self, shape) -> str:
        """
        Embeds the image as a base64 <img> in HTML.

        Returns "" (after printing a warning) when the picture cannot be read,
        so one bad image does not abort the whole conversion.
        """
        try:
            # Reading shape.image can itself fail (for example, a picture with
            # no embedded image data), so it is done inside the try.
            image = shape.image
            img_str = base64.b64encode(image.blob).decode('utf-8')
            return f"<img src='data:{image.content_type};base64,{img_str}' />"
        except Exception as e:
            print(f"Warning: image cannot be loaded by Pillow: {e}")
            return ""

    def _handle_table(self, shape) -> str:
        """
        Renders a shape's table as an HTML <table>.

        Every cell becomes a ``<td>`` holding the cell's escaped text.
        """
        rows_html = []
        for row in shape.table.rows:
            cells_html = "".join(
                f"<td>{self._escape_html(cell.text)}</td>" for cell in row.cells
            )
            rows_html.append(f"<tr>{cells_html}</tr>")
        return "<table border='1'>" + "".join(rows_html) + "</table>"

    def _escape_html(self, text: str) -> str:
        """
        Minimal escaping for HTML special characters.

        Replaces ``&``, ``<``, ``>``, ``"`` and ``'`` with their entities in a
        single pass.
        """
        return text.translate(self._HTML_ESCAPES)
poetry.lock CHANGED
@@ -4037,6 +4037,23 @@ files = [
4037
  {file = "python_multipart-0.0.16.tar.gz", hash = "sha256:8dee37b88dab9b59922ca173c35acb627cc12ec74019f5cd4578369c6df36554"},
4038
  ]
4039
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4040
  [[package]]
4041
  name = "pytz"
4042
  version = "2024.2"
@@ -5645,6 +5662,17 @@ files = [
5645
  {file = "widgetsnbextension-4.0.13.tar.gz", hash = "sha256:ffcb67bc9febd10234a362795f643927f4e0c05d9342c727b65d2384f8feacb6"},
5646
  ]
5647
 
 
 
 
 
 
 
 
 
 
 
 
5648
  [[package]]
5649
  name = "xxhash"
5650
  version = "3.5.0"
@@ -5959,4 +5987,4 @@ test = ["pytest"]
5959
  [metadata]
5960
  lock-version = "2.0"
5961
  python-versions = "^3.10"
5962
- content-hash = "42e3b1c26e61c9e61909cdd10ffa28b2218e265c3fb4d62f00466aa91434429b"
 
4037
  {file = "python_multipart-0.0.16.tar.gz", hash = "sha256:8dee37b88dab9b59922ca173c35acb627cc12ec74019f5cd4578369c6df36554"},
4038
  ]
4039
 
4040
+ [[package]]
4041
+ name = "python-pptx"
4042
+ version = "1.0.2"
4043
+ description = "Create, read, and update PowerPoint 2007+ (.pptx) files."
4044
+ optional = false
4045
+ python-versions = ">=3.8"
4046
+ files = [
4047
+ {file = "python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba"},
4048
+ {file = "python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095"},
4049
+ ]
4050
+
4051
+ [package.dependencies]
4052
+ lxml = ">=3.1.0"
4053
+ Pillow = ">=3.3.2"
4054
+ typing-extensions = ">=4.9.0"
4055
+ XlsxWriter = ">=0.5.7"
4056
+
4057
  [[package]]
4058
  name = "pytz"
4059
  version = "2024.2"
 
5662
  {file = "widgetsnbextension-4.0.13.tar.gz", hash = "sha256:ffcb67bc9febd10234a362795f643927f4e0c05d9342c727b65d2384f8feacb6"},
5663
  ]
5664
 
5665
+ [[package]]
5666
+ name = "xlsxwriter"
5667
+ version = "3.2.1"
5668
+ description = "A Python module for creating Excel XLSX files."
5669
+ optional = false
5670
+ python-versions = ">=3.6"
5671
+ files = [
5672
+ {file = "XlsxWriter-3.2.1-py3-none-any.whl", hash = "sha256:7e8f7c60b7a1660ef791d46ab5de78469cb978b991ca841af61f5832d2f9f4fe"},
5673
+ {file = "XlsxWriter-3.2.1.tar.gz", hash = "sha256:97618759cb264fb6a93397f660cca156ffa9561743b1823dafb60dc4474e1902"},
5674
+ ]
5675
+
5676
  [[package]]
5677
  name = "xxhash"
5678
  version = "3.5.0"
 
5987
  [metadata]
5988
  lock-version = "2.0"
5989
  python-versions = "^3.10"
5990
+ content-hash = "429b563e9a609f51ba8185407ef5ef1219caf582b09386f5dca4740ff4386ada"
pyproject.toml CHANGED
@@ -39,6 +39,7 @@ scikit-learn = "^1.6.1"
39
  mammoth = "^1.9.0"
40
  weasyprint = "^63.1"
41
  openpyxl = "^3.1.5"
 
42
 
43
  [tool.poetry.group.dev.dependencies]
44
  jupyter = "^1.0.0"
 
39
  mammoth = "^1.9.0"
40
  weasyprint = "^63.1"
41
  openpyxl = "^3.1.5"
42
+ python-pptx = "^1.0.2"
43
 
44
  [tool.poetry.group.dev.dependencies]
45
  jupyter = "^1.0.0"