| """PowerPoint file processor.""" |
|
|
| import os |
| import logging |
| from typing import Dict, Any |
|
|
| from .base import BaseProcessor |
| from ..result import ConversionResult |
| from ..exceptions import ConversionError, FileNotFoundError |
|
|
| |
| logger = logging.getLogger(__name__) |
|
|
|
|
class PPTXProcessor(BaseProcessor):
    """Processor for PowerPoint files (PPT, PPTX).

    ``.ppt`` files are forwarded to pypandoc; every other file is parsed with
    python-pptx. Both paths normalise the extracted text with
    ``_clean_content`` and return it wrapped in a ``ConversionResult``
    together with a metadata dictionary.
    """

    # File extensions accepted by ``can_process`` (compared lower-cased).
    _SUPPORTED_EXTENSIONS = ('.ppt', '.pptx')
    # Prefix of the per-slide heading emitted when ``preserve_layout`` is set.
    _SLIDE_HEADING_PREFIX = '## Slide'

    @staticmethod
    def _lowercase_extension(file_path: str) -> str:
        """Return the lower-cased extension of ``file_path``, dot included."""
        return os.path.splitext(str(file_path).lower())[1]

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if the path exists and ends in ``.ppt`` or ``.pptx``
            (case-insensitive), False otherwise
        """
        if not os.path.exists(file_path):
            return False
        return self._lowercase_extension(file_path) in self._SUPPORTED_EXTENSIONS

    def process(self, file_path: str) -> ConversionResult:
        """Process the PowerPoint file and return a conversion result.

        Args:
            file_path: Path to the PowerPoint file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            FileNotFoundError: If the file doesn't exist
            ConversionError: If processing fails
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        metadata: Dict[str, Any] = {
            "file_path": file_path,
            "file_size": os.path.getsize(file_path),
            "processor": "PPTXProcessor",
        }

        # Only the legacy extension takes the pypandoc route; anything else
        # (including unexpected extensions) is handed to python-pptx.
        if self._lowercase_extension(file_path) == '.ppt':
            return self._process_ppt_file(file_path, metadata)
        return self._process_pptx_file(file_path, metadata)

    def _process_ppt_file(self, file_path: str, metadata: Dict[str, Any]) -> ConversionResult:
        """Process .ppt files using pypandoc.

        NOTE(review): this simply forwards the file to
        ``pypandoc.convert_file``; confirm that the installed pandoc accepts
        legacy binary ``.ppt`` input.

        Args:
            file_path: Path to the ``.ppt`` file
            metadata: Metadata dictionary; updated in place with
                ``file_type`` and ``extractor``

        Returns:
            ConversionResult holding the cleaned markdown text and metadata

        Raises:
            ConversionError: If pypandoc is not installed or conversion fails
        """
        # Import in its own ``try`` so that only a genuinely missing package
        # yields the "install it" hint; an ImportError raised later during
        # conversion is reported as a processing failure instead.
        try:
            import pypandoc
        except ImportError as exc:
            raise ConversionError(
                "pypandoc is required for .ppt file processing. "
                "Install it with: pip install pypandoc"
            ) from exc

        try:
            content = pypandoc.convert_file(file_path, 'markdown')

            metadata.update({
                "file_type": "ppt",
                "extractor": "pypandoc",
            })

            return ConversionResult(self._clean_content(content), metadata)
        except (FileNotFoundError, ConversionError):
            # Already one of the package's own errors; do not re-wrap
            # (mirrors the .pptx path).
            raise
        except Exception as exc:
            raise ConversionError(
                f"Failed to process .ppt file {file_path}: {exc}"
            ) from exc

    def _process_pptx_file(self, file_path: str, metadata: Dict[str, Any]) -> ConversionResult:
        """Process .pptx files using python-pptx.

        Collects the stripped, non-empty ``text`` of every shape on every
        slide. When the instance has a truthy ``preserve_layout`` attribute a
        ``## Slide N`` heading is emitted before each slide's text.

        Args:
            file_path: Path to the presentation file
            metadata: Metadata dictionary; updated in place with
                ``slide_count``, ``file_type`` and ``extractor``

        Returns:
            ConversionResult holding the cleaned text and metadata

        Raises:
            ConversionError: If python-pptx is not installed or parsing fails
        """
        # Separate ``try`` for the import: see ``_process_ppt_file``.
        try:
            from pptx import Presentation
        except ImportError as exc:
            raise ConversionError(
                "python-pptx is required for .pptx file processing. "
                "Install it with: pip install python-pptx"
            ) from exc

        try:
            prs = Presentation(file_path)

            metadata.update({
                "slide_count": len(prs.slides),
                "file_type": "pptx",
                "extractor": "python-pptx",
            })

            # The attribute is optional on the instance; default to plain text.
            preserve_layout = getattr(self, 'preserve_layout', False)

            content_parts = []
            for slide_num, slide in enumerate(prs.slides, 1):
                if preserve_layout:
                    content_parts.append(f"\n## Slide {slide_num}\n")

                slide_texts = []
                for shape in slide.shapes:
                    # Not every shape exposes ``text``; read it once and skip
                    # shapes that have none or only whitespace.
                    text = getattr(shape, "text", None)
                    stripped = text.strip() if text else ""
                    if stripped:
                        slide_texts.append(stripped)

                if slide_texts:
                    content_parts.extend(slide_texts)
                    # Empty entry marks the slide boundary in the raw text;
                    # ``_clean_content`` drops blank lines afterwards.
                    content_parts.append("")

            content = "\n\n".join(content_parts)
            return ConversionResult(self._clean_content(content), metadata)
        except (FileNotFoundError, ConversionError):
            # Already one of the package's own errors; propagate unchanged.
            raise
        except Exception as exc:
            raise ConversionError(
                f"Failed to process .pptx file {file_path}: {exc}"
            ) from exc

    def _clean_content(self, content: str) -> str:
        """Clean up the extracted PowerPoint content.

        Collapses every run of whitespace inside a line to a single space,
        drops blank lines, and puts one blank line in front of each line that
        starts with ``## Slide`` (except when it is the first line).

        Args:
            content: Raw PowerPoint text content

        Returns:
            Cleaned text content; an empty string for empty input
        """
        if not content:
            return ""

        cleaned_lines = []
        for raw_line in content.split('\n'):
            line = ' '.join(raw_line.split())
            if not line:
                continue
            # Separate slide headings from the preceding text. Matching only
            # at the start of a line avoids splitting deeper headings such as
            # "### Slide ..." or body text that merely mentions "## Slide".
            if cleaned_lines and line.startswith(self._SLIDE_HEADING_PREFIX):
                cleaned_lines.append('')
            cleaned_lines.append(line)

        return '\n'.join(cleaned_lines)