"""PowerPoint file processor."""
import os
import logging
from typing import Dict, Any
from .base import BaseProcessor
from ..result import ConversionResult
from ..exceptions import ConversionError, FileNotFoundError
# Module-level logger named after this module; nothing is configured here
# (no handlers or levels are set on it in this file).
logger = logging.getLogger(__name__)
class PPTXProcessor(BaseProcessor):
    """Processor for PowerPoint files (PPT, PPTX).

    ``.ppt`` files are converted to markdown with pypandoc; every other
    extension is read with python-pptx. Both back-ends are imported inside
    the methods that use them, so importing this module does not require
    either package.
    """

    # Extensions accepted by can_process().
    _SUPPORTED_EXTENSIONS = ('.ppt', '.pptx')
    # Prefix of the per-slide header emitted when preserve_layout is truthy;
    # _clean_content() keys its header spacing on the same prefix.
    _SLIDE_HEADER_PREFIX = '## Slide'

    @staticmethod
    def _extension(file_path) -> str:
        """Return the lower-cased extension of *file_path* (dot included)."""
        # str() so that non-string path objects are accepted as well.
        return os.path.splitext(str(file_path).lower())[1]

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if the path exists and has a ``.ppt`` or ``.pptx``
            extension (case-insensitive), False otherwise
        """
        if not os.path.exists(file_path):
            return False
        return self._extension(file_path) in self._SUPPORTED_EXTENSIONS

    def process(self, file_path: str) -> ConversionResult:
        """Process the PowerPoint file and return a conversion result.

        Args:
            file_path: Path to the PowerPoint file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            FileNotFoundError: If the file doesn't exist
            ConversionError: If processing fails
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # Base metadata; the format-specific helpers add their own keys.
        metadata: Dict[str, Any] = {
            "file_path": file_path,
            "file_size": os.path.getsize(file_path),
            "processor": "PPTXProcessor",
        }

        # Only '.ppt' goes through pandoc; anything else is treated as .pptx.
        if self._extension(file_path) == '.ppt':
            return self._process_ppt_file(file_path, metadata)
        return self._process_pptx_file(file_path, metadata)

    def _process_ppt_file(self, file_path: str, metadata: Dict[str, Any]) -> ConversionResult:
        """Process .ppt files using pypandoc.

        Args:
            file_path: Path to the .ppt file
            metadata: Metadata dict; updated in place with ``file_type``
                and ``extractor``

        Returns:
            ConversionResult with the cleaned markdown text and metadata

        Raises:
            ConversionError: If pypandoc is not installed or conversion fails
        """
        # Keep the import in its own try so that only a missing package is
        # reported as "pypandoc is required".
        try:
            import pypandoc
        except ImportError as e:
            raise ConversionError(
                "pypandoc is required for .ppt file processing. Install it with: pip install pypandoc"
            ) from e

        try:
            # Convert .ppt to markdown using pandoc
            content = pypandoc.convert_file(file_path, 'markdown')
            metadata.update({
                "file_type": "ppt",
                "extractor": "pypandoc",
            })
            return ConversionResult(self._clean_content(content), metadata)
        except ConversionError:
            # Already the right type; do not wrap it a second time.
            raise
        except Exception as e:
            raise ConversionError(f"Failed to process .ppt file {file_path}: {e}") from e

    def _process_pptx_file(self, file_path: str, metadata: Dict[str, Any]) -> ConversionResult:
        """Process .pptx files using python-pptx.

        Args:
            file_path: Path to the .pptx file
            metadata: Metadata dict; updated in place with ``slide_count``,
                ``file_type`` and ``extractor``

        Returns:
            ConversionResult with the cleaned slide text and metadata

        Raises:
            ConversionError: If python-pptx is not installed or reading fails
        """
        # Keep the import in its own try so that only a missing package is
        # reported as "python-pptx is required".
        try:
            from pptx import Presentation
        except ImportError as e:
            raise ConversionError(
                "python-pptx is required for .pptx file processing. Install it with: pip install python-pptx"
            ) from e

        try:
            prs = Presentation(file_path)
            metadata.update({
                "slide_count": len(prs.slides),
                "file_type": "pptx",
                "extractor": "python-pptx",
            })

            # preserve_layout is optional: it is read from the instance if
            # something (base class or config) set it, else defaults to False.
            preserve_layout = getattr(self, 'preserve_layout', False)

            content_parts = []
            for slide_num, slide in enumerate(prs.slides, 1):
                if preserve_layout:
                    content_parts.append(f"\n{self._SLIDE_HEADER_PREFIX} {slide_num}\n")
                # Only shapes exposing a ``text`` attribute contribute;
                # shapes without one, or with blank text, are skipped.
                for shape in slide.shapes:
                    if not hasattr(shape, "text"):
                        continue
                    text = shape.text.strip()
                    if text:
                        content_parts.append(text)

            # No blank separator entries are added between slides:
            # _clean_content() drops every blank line anyway, so they never
            # reached the output.
            # NOTE(review): if visible spacing between slides is wanted when
            # preserve_layout is off, it has to be added after cleaning.
            content = self._clean_content("\n\n".join(content_parts))
            return ConversionResult(content, metadata)
        except (FileNotFoundError, ConversionError):
            # Project exceptions pass through unchanged.
            raise
        except Exception as e:
            raise ConversionError(f"Failed to process .pptx file {file_path}: {e}") from e

    def _clean_content(self, content: str) -> str:
        """Clean up the extracted PowerPoint content.

        Collapses every run of whitespace inside a line to a single space,
        drops blank lines, and puts one blank line before each slide header
        line (a line starting with ``## Slide``) except at the very top.

        Args:
            content: Raw PowerPoint text content

        Returns:
            Cleaned text content; an empty string for empty or None input
        """
        if not content:
            return ""

        cleaned_lines = []
        for raw_line in content.split('\n'):
            # split()/join collapses whitespace runs and trims both ends.
            line = ' '.join(raw_line.split())
            if not line:
                continue
            # Space out headers only when the line itself is a header, so
            # text that merely contains "## Slide" is left intact.
            if cleaned_lines and line.startswith(self._SLIDE_HEADER_PREFIX):
                cleaned_lines.append('')
            cleaned_lines.append(line)

        return '\n'.join(cleaned_lines)