File size: 5,616 Bytes
5b14aa2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
"""PowerPoint file processor."""

import os
import logging
from typing import Dict, Any

from .base import BaseProcessor
from ..result import ConversionResult
from ..exceptions import ConversionError, FileNotFoundError

# Module-level logger named after this module; no handlers or levels are configured here.
logger = logging.getLogger(__name__)


class PPTXProcessor(BaseProcessor):
    """Processor for PowerPoint files (PPT, PPTX).

    ``.ppt`` paths are converted to markdown through pypandoc; every other
    path handed to :meth:`process` is read with python-pptx. Both packages
    are imported lazily inside the method that needs them, so the module
    can be imported without either being installed.
    """

    @staticmethod
    def _lowercase_extension(file_path: str) -> str:
        """Return the lower-cased extension of *file_path*, dot included.

        Args:
            file_path: Path whose extension is wanted

        Returns:
            The extension (e.g. ``'.pptx'``), or ``''`` when there is none
        """
        # str() keeps this working when a path-like object is passed.
        _, ext = os.path.splitext(str(file_path).lower())
        return ext

    def can_process(self, file_path: str) -> bool:
        """Check if this processor can handle the given file.

        Args:
            file_path: Path to the file to check

        Returns:
            True if the path exists and ends in ``.ppt`` or ``.pptx``
            (case-insensitive), False otherwise
        """
        if not os.path.exists(file_path):
            return False

        return self._lowercase_extension(file_path) in ('.ppt', '.pptx')

    def process(self, file_path: str) -> ConversionResult:
        """Process the PowerPoint file and return a conversion result.

        Args:
            file_path: Path to the PowerPoint file to process

        Returns:
            ConversionResult containing the processed content

        Raises:
            FileNotFoundError: If the file doesn't exist
            ConversionError: If processing fails
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # Base metadata; the format-specific helpers add their own keys.
        metadata: Dict[str, Any] = {
            "file_path": file_path,
            "file_size": os.path.getsize(file_path),
            "processor": "PPTXProcessor"
        }

        # Only '.ppt' takes the pandoc route; any other extension is
        # handed to the python-pptx reader.
        if self._lowercase_extension(file_path) == '.ppt':
            return self._process_ppt_file(file_path, metadata)
        return self._process_pptx_file(file_path, metadata)

    def _process_ppt_file(self, file_path: str, metadata: Dict[str, Any]) -> ConversionResult:
        """Process .ppt files using pypandoc.

        Args:
            file_path: Path to the .ppt file
            metadata: Metadata dict to extend; updated in place with
                ``file_type`` and ``extractor``

        Returns:
            ConversionResult holding the cleaned markdown and the metadata

        Raises:
            ConversionError: If pypandoc is not installed or conversion fails
        """
        # Guard only the import so that an ImportError raised later, during
        # conversion, is not misreported as "pypandoc is not installed".
        try:
            import pypandoc
        except ImportError as exc:
            raise ConversionError("pypandoc is required for .ppt file processing. Install it with: pip install pypandoc") from exc

        try:
            # NOTE(review): confirm the installed pandoc can actually read
            # the binary .ppt format; failures surface as ConversionError.
            content = pypandoc.convert_file(file_path, 'markdown')

            metadata.update({
                "file_type": "ppt",
                "extractor": "pypandoc"
            })

            return ConversionResult(self._clean_content(content), metadata)
        except (FileNotFoundError, ConversionError):
            # Already one of the package's own error types; pass it through,
            # matching the .pptx path.
            raise
        except Exception as exc:
            raise ConversionError(f"Failed to process .ppt file {file_path}: {exc}") from exc

    def _process_pptx_file(self, file_path: str, metadata: Dict[str, Any]) -> ConversionResult:
        """Process .pptx files using python-pptx.

        Text is collected from every shape on every slide that exposes a
        non-blank ``text`` attribute. When the instance has a truthy
        ``preserve_layout`` attribute, a ``## Slide N`` heading is emitted
        before each slide's text.

        Args:
            file_path: Path to the .pptx file
            metadata: Metadata dict to extend; updated in place with
                ``slide_count``, ``file_type`` and ``extractor``

        Returns:
            ConversionResult holding the cleaned text and the metadata

        Raises:
            ConversionError: If python-pptx is not installed or reading fails
        """
        # Guard only the import so that an ImportError raised later, during
        # parsing, is not misreported as "python-pptx is not installed".
        try:
            from pptx import Presentation
        except ImportError as exc:
            raise ConversionError("python-pptx is required for .pptx file processing. Install it with: pip install python-pptx") from exc

        try:
            prs = Presentation(file_path)

            metadata.update({
                "slide_count": len(prs.slides),
                "file_type": "pptx",
                "extractor": "python-pptx"
            })

            # preserve_layout is optional on the instance; default is off.
            preserve_layout = getattr(self, 'preserve_layout', False)

            content_parts = []
            for slide_num, slide in enumerate(prs.slides, 1):
                if preserve_layout:
                    content_parts.append(f"## Slide {slide_num}")

                for shape in slide.shapes:
                    if not hasattr(shape, "text"):
                        continue
                    text = shape.text.strip()
                    if text:
                        content_parts.append(text)

            # _clean_content drops blank lines and re-inserts the separator
            # before slide headings, so no extra spacing is added here.
            content = self._clean_content("\n".join(content_parts))

            return ConversionResult(content, metadata)
        except (FileNotFoundError, ConversionError):
            raise
        except Exception as exc:
            raise ConversionError(f"Failed to process .pptx file {file_path}: {exc}") from exc

    def _clean_content(self, content: str) -> str:
        """Clean up the extracted PowerPoint content.

        Each line has its runs of whitespace collapsed to single spaces,
        blank lines are dropped, and one blank line is inserted before every
        line that starts with ``## Slide`` (except at the very top).

        Args:
            content: Raw PowerPoint text content

        Returns:
            Cleaned text content
        """
        cleaned_lines = []

        for raw_line in content.split('\n'):
            # split()/join collapses internal whitespace and trims the ends.
            line = ' '.join(raw_line.split())
            if not line:
                continue
            # Only a line that *begins* with the heading marker gets a
            # separator; a substring match would corrupt lines such as
            # '### Slide title' or body text mentioning '## Slide'.
            if cleaned_lines and line.startswith('## Slide'):
                cleaned_lines.append('')
            cleaned_lines.append(line)

        return '\n'.join(cleaned_lines)