Spaces:
Paused
Paused
| from typing import Dict, List, Optional, Tuple | |
| import logging | |
| from pathlib import Path | |
| from pptx import Presentation | |
| from pptx.shapes.base import BaseShape | |
| from pptx.shapes.group import GroupShape | |
| from pptx.shapes.shapetree import SlideShapes | |
| from pptx.enum.shapes import MSO_SHAPE_TYPE | |
| from dataclasses import dataclass | |
@dataclass
class ExtractedSlide:
    """Content extracted from a single slide.

    Declared as a dataclass so instances can be built with keyword
    arguments (``ExtractedSlide(number=..., title=..., ...)``).

    Attributes:
        number: Slide number as assigned by the caller.
        title: Slide title text.
        content: Text fragments extracted from the slide's shapes.
        notes: Speaker notes text, or None when the slide has none.
        has_tables: True when at least one table was found on the slide.
    """

    number: int
    title: str
    content: List[str]
    notes: Optional[str]
    has_tables: bool
class PPTXConverter:
    """PPTX converter specialised in accurate text extraction.

    Intended for tender presentations: walks every slide, collects text
    from shapes (including grouped shapes, tables and chart titles) and
    speaker notes, and returns the combined text plus summary metadata.
    """

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self._setup_logging()

    def _setup_logging(self):
        """Configure root logging (file + console) if not configured yet."""
        # logging.basicConfig() does nothing once the root logger has
        # handlers, so return first: otherwise every new converter would open
        # another FileHandler that is never attached to anything.
        if logging.getLogger().handlers:
            return
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            handlers=[
                # Messages contain Polish diacritics; do not depend on the
                # platform default encoding for the log file.
                logging.FileHandler('pptx_conversion.log', encoding='utf-8'),
                logging.StreamHandler(),
            ],
        )

    def convert_to_text(self, file_path: str) -> Tuple[str, Dict]:
        """Convert a PPTX presentation to text, preserving slide structure.

        Args:
            file_path (str): Path to the PPTX file.

        Returns:
            Tuple[str, Dict]: (full_text, metadata)

        Raises:
            FileNotFoundError: When the file does not exist.
            Exception: Any error raised while loading or processing the
                presentation is logged and re-raised unchanged.
        """
        try:
            # Check that the file exists before handing it to python-pptx.
            if not Path(file_path).exists():
                raise FileNotFoundError(f"Plik nie istnieje: {file_path}")

            self.logger.info("Rozpoczęto konwersję: %s", file_path)
            prs = Presentation(file_path)

            # Process every slide; slide numbers are 1-based.
            extracted_slides = []
            for slide_number, slide in enumerate(prs.slides, 1):
                self.logger.debug("Przetwarzanie slajdu %s", slide_number)
                extracted_slides.append(self._extract_slide(slide, slide_number))
                self.logger.debug(
                    "Zakończono przetwarzanie slajdu %s", slide_number
                )

            full_text = self._combine_slides_content(extracted_slides)
            metadata = self._prepare_metadata(prs, extracted_slides)

            self.logger.info("Zakończono konwersję: %s", file_path)
            return full_text, metadata
        except Exception as e:
            self.logger.error("Błąd podczas konwersji PPTX: %s", e)
            raise

    def _extract_slide(self, slide, slide_number: int) -> "ExtractedSlide":
        """Extract the content of a single slide.

        Args:
            slide: Slide object from python-pptx.
            slide_number (int): Slide number.

        Returns:
            ExtractedSlide: Extracted content. If anything fails, the error
            is logged and an empty slide marked as failed is returned.
        """
        content_parts = []
        title = None
        has_tables = False
        try:
            # The title is the slide's title placeholder (None when the
            # layout has none); individual shapes carry no "is title" flag.
            title_shape = slide.shapes.title
            title_id = title_shape.shape_id if title_shape is not None else None

            # 1. Text from all shapes, including those nested in groups.
            for shape in self._get_all_shapes(slide.shapes):
                # has_table also covers tables placed inside placeholders and
                # does not raise for shapes without a shape_type.
                if getattr(shape, 'has_table', False):
                    has_tables = True

                text = self._extract_shape_text(shape)
                if not text:
                    continue
                if title_id is not None and shape.shape_id == title_id:
                    title = text
                else:
                    content_parts.append(text)

            # 2. Speaker notes. notes_text_frame is None when the notes
            # slide has no notes placeholder, so check before reading .text.
            notes = None
            if slide.has_notes_slide:
                notes_frame = slide.notes_slide.notes_text_frame
                if notes_frame is not None:
                    notes = notes_frame.text.strip()

            return ExtractedSlide(
                number=slide_number,
                title=title or f"Slajd {slide_number}",
                content=content_parts,
                notes=notes,
                has_tables=has_tables,
            )
        except Exception as e:
            self.logger.error(
                "Błąd podczas ekstrakcji slajdu %s: %s", slide_number, e
            )
            # Return an empty slide so one bad slide does not abort the run.
            return ExtractedSlide(
                number=slide_number,
                title=f"Slajd {slide_number} (Błąd przetwarzania)",
                content=[],
                notes=None,
                has_tables=False,
            )

    def _get_all_shapes(self, shapes: "SlideShapes") -> List["BaseShape"]:
        """Recursively collect all shapes, including those inside groups.

        Args:
            shapes: Shape collection of a slide (or of a group shape during
                recursion).

        Returns:
            List[BaseShape]: Flat list of all non-group shapes, in order.
        """
        all_shapes = []
        for shape in shapes:
            if isinstance(shape, GroupShape):
                all_shapes.extend(self._get_all_shapes(shape.shapes))
            else:
                all_shapes.append(shape)
        return all_shapes

    def _extract_shape_text(self, shape: "BaseShape") -> Optional[str]:
        """Extract text from a single shape, handling several shape kinds.

        Tries, in order: the shape's own text, table cells (rows joined
        with newlines, cells with ' | '), SmartArt nodes, chart title.

        Args:
            shape (BaseShape): Shape to process.

        Returns:
            Optional[str]: Extracted text, or None when the shape yields no
            text or extraction fails (failures are logged as warnings).
        """
        try:
            # 1. Plain text
            if hasattr(shape, 'text'):
                text = shape.text.strip()
                if text:
                    return text

            # 2. Table
            if getattr(shape, 'has_table', False):
                table_rows = []
                for row in shape.table.rows:
                    row_texts = [cell.text.strip() for cell in row.cells]
                    if any(row_texts):  # skip completely empty rows
                        table_rows.append(' | '.join(row_texts))
                if table_rows:
                    return '\n'.join(table_rows)

            # 3. SmartArt
            # NOTE(review): python-pptx does not appear to expose a
            # `smart_art` attribute; kept as a guarded branch - confirm.
            smart_art = getattr(shape, 'smart_art', None)
            if smart_art:
                texts = [
                    node.text.strip()
                    for node in smart_art.nodes
                    if node.text.strip()
                ]
                if texts:
                    return ' > '.join(texts)

            # 4. Charts (basic extraction: title only)
            if getattr(shape, 'has_chart', False):
                chart = shape.chart
                chart_text = []
                # Check has_title / has_text_frame first: reading
                # chart_title.text_frame on an untitled chart would create
                # a title element in the loaded presentation.
                if chart.has_title and chart.chart_title.has_text_frame:
                    title_text = chart.chart_title.text_frame.text.strip()
                    if title_text:
                        chart_text.append(title_text)
                # More chart data could be extracted here if needed.
                if chart_text:
                    return ' | '.join(chart_text)

            return None
        except Exception as e:
            self.logger.warning(
                "Błąd podczas ekstrakcji tekstu z shape'a: %s", e
            )
            return None

    def _combine_slides_content(self, slides: List["ExtractedSlide"]) -> str:
        """Join the content of all slides into a single text.

        Args:
            slides (List[ExtractedSlide]): Processed slides.

        Returns:
            str: Combined text; each slide gets a header line, its
            non-blank content, optional notes and a dashed separator.
        """
        text_parts = []
        for slide in slides:
            # Slide header
            text_parts.append(f"\n=== {slide.title} (Slajd {slide.number}) ===\n")

            # Main content, skipping whitespace-only fragments
            text_parts.extend(c for c in slide.content if c.strip())

            # Notes, if any
            if slide.notes:
                text_parts.append(f"\n[Notatki: {slide.notes}]\n")

            # Separator between slides
            text_parts.append("\n" + "-" * 50 + "\n")
        return '\n'.join(text_parts)

    def _prepare_metadata(self, presentation: "Presentation",
                          slides: List["ExtractedSlide"]) -> Dict:
        """Build metadata about the presentation.

        Args:
            presentation: Presentation object (its ``slides`` are counted).
            slides (List[ExtractedSlide]): Processed slides.

        Returns:
            Dict: Totals (slide count, slides with notes/tables, total
            content length) and a per-slide structure list.
        """
        return {
            'total_slides': len(presentation.slides),
            'slides_with_notes': sum(1 for s in slides if s.notes),
            'slides_with_tables': sum(1 for s in slides if s.has_tables),
            'total_content_length': sum(
                len(content) for slide in slides
                for content in slide.content
            ),
            'slide_structure': [
                {
                    'number': s.number,
                    'title': s.title,
                    'content_length': sum(len(c) for c in s.content),
                    'has_notes': bool(s.notes),
                    'has_tables': s.has_tables,
                }
                for s in slides
            ],
        }
if __name__ == "__main__":
    # Usage example: convert a sample presentation and print a preview of
    # the extracted text together with the metadata.
    converter = PPTXConverter()
    try:
        text, metadata = converter.convert_to_text("example.pptx")
        print("Skonwertowany tekst:", text[:500], "...")
        print("\nMetadane:", metadata)
    except Exception as e:
        # Top-level boundary of the demo script: report the error and exit.
        print(f"Błąd: {e}")