Spaces:
Paused
Paused
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
| from io import BytesIO | |
| from pptx import Presentation | |
class RAGFlowPptParser(object):
    """Extract plain text from the slides of a PowerPoint presentation.

    Calling an instance returns one string per slide in the requested page
    range. Each string joins the text of the slide's shapes, visited
    top-to-bottom and then left-to-right.
    """

    # Integer values of python-pptx's MSO_SHAPE_TYPE members that need
    # dedicated handling (plain ints, so no extra import is required).
    _GROUP_SHAPE_TYPE = 6
    _TABLE_SHAPE_TYPE = 19

    def __init__(self):
        super().__init__()

    @staticmethod
    def _reading_order(shape):
        """Return a sort key ordering shapes top-to-bottom, then left-to-right.

        ``top`` is integer-divided by 10 so shapes whose tops differ by less
        than that compare equal on the first component and fall back to
        ``left``. A ``top`` or ``left`` of ``None`` is treated as 0 so that
        shapes without an explicit position can still be compared.
        """
        top = shape.top if shape.top is not None else 0
        left = shape.left if shape.left is not None else 0
        return (top // 10, left)

    def __extract(self, shape):
        """Return the text carried by ``shape``.

        * Table: one line per data row, formatted as
          ``"<header>: <cell>; <header>: <cell>"`` where the headers come
          from the first row. A table with fewer than two rows yields ``""``.
        * Shape with a text frame: the text frame's text.
        * Group: the text of its children, recursively, joined by newlines
          in reading order; children without text are skipped.
        * Anything else: ``None``.
        """
        # python-pptx raises NotImplementedError for shapes whose type it
        # cannot classify; treat those as "unknown" rather than aborting the
        # whole parse, so their text frame (if any) is still picked up below.
        try:
            shape_type = shape.shape_type
        except NotImplementedError:
            shape_type = None

        if shape_type == self._TABLE_SHAPE_TYPE:
            tb = shape.table
            n_rows = len(tb.rows)
            if n_rows < 2:
                # Header only (or empty): there are no data rows to emit.
                return ""
            n_cols = len(tb.columns)
            # Read the header row once instead of once per data row.
            headers = [tb.cell(0, j).text for j in range(n_cols)]
            rows = [
                "; ".join(
                    f"{headers[j]}: {tb.cell(i, j).text}" for j in range(n_cols)
                )
                for i in range(1, n_rows)
            ]
            return "\n".join(rows)

        if shape.has_text_frame:
            return shape.text_frame.text

        if shape_type == self._GROUP_SHAPE_TYPE:
            texts = []
            for child in sorted(shape.shapes, key=self._reading_order):
                t = self.__extract(child)
                if t:
                    texts.append(t)
            return "\n".join(texts)

        return None

    def __call__(self, fnm, from_page, to_page, callback=None):
        """Parse a presentation and return the text of the selected slides.

        Args:
            fnm: A ``str`` is handed to ``Presentation`` as-is; any other
                value is wrapped in ``BytesIO`` first, so it must be a
                bytes-like object holding the file content.
            from_page: Zero-based index of the first slide to include.
            to_page: Zero-based index at which to stop (exclusive).
            callback: Accepted for interface compatibility; not used here.

        Returns:
            A list with one string per selected slide. A slide without any
            extractable text contributes an empty string.

        Side effects:
            Sets ``self.total_page`` to the total number of slides in the
            presentation, regardless of the requested range.
        """
        ppt = Presentation(fnm) if isinstance(
            fnm, str) else Presentation(BytesIO(fnm))
        self.total_page = len(ppt.slides)

        txts = []
        for i, slide in enumerate(ppt.slides):
            if i < from_page:
                continue
            if i >= to_page:
                break
            texts = []
            for shape in sorted(slide.shapes, key=self._reading_order):
                txt = self.__extract(shape)
                if txt:
                    texts.append(txt)
            txts.append("\n".join(texts))
        return txts