File size: 7,794 Bytes
5b14aa2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
"""DOCX file processor."""

import os
from typing import Dict, Any

from .base import BaseProcessor
from ..result import ConversionResult
from ..exceptions import ConversionError, FileNotFoundError


class DOCXProcessor(BaseProcessor):
    """Processor for Microsoft Word DOCX and DOC files."""
    
    def can_process(self, file_path: str) -> bool:
        """Report whether *file_path* is an existing Word document.

        Args:
            file_path: Location of the candidate file.

        Returns:
            True when the path exists and its extension, compared
            case-insensitively, is ``.docx`` or ``.doc``; False otherwise.
        """
        if os.path.exists(file_path):
            # Coerce to str before lower-casing in case a non-string path
            # object was passed in.
            suffix = os.path.splitext(str(file_path).lower())[1]
            return suffix in ('.docx', '.doc')
        return False
    
    def process(self, file_path: str) -> ConversionResult:
        """Process the DOCX file and return a conversion result.
        
        Args:
            file_path: Path to the DOCX file to process
            
        Returns:
            ConversionResult containing the processed content
            
        Raises:
            FileNotFoundError: If the file doesn't exist
            ConversionError: If processing fails
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")
        
        # Initialize metadata
        metadata = {
            "file_path": file_path,
            "file_size": os.path.getsize(file_path),
            "processor": "DOCXProcessor"
        }
        
        # Check file extension - ensure file_path is a string
        file_path_str = str(file_path)
        _, ext = os.path.splitext(file_path_str.lower())
        
        if ext == '.doc':
            return self._process_doc_file(file_path, metadata)
        else:
            return self._process_docx_file(file_path, metadata)
    
    def _process_doc_file(self, file_path: str, metadata: Dict[str, Any]) -> ConversionResult:
        """Convert a legacy ``.doc`` file to markdown through pypandoc.

        Args:
            file_path: Path of the ``.doc`` file to convert.
            metadata: Metadata collected so far; updated in place with
                ``file_type`` and ``extractor`` once the conversion succeeds.

        Returns:
            ConversionResult holding the cleaned markdown and the metadata.

        Raises:
            ConversionError: If pypandoc is not installed or the conversion
                fails. The triggering exception is attached as ``__cause__``.
        """
        # Import lazily so the dependency is only required for .doc input.
        # The import has its own try block so that an ImportError raised
        # while converting is not misreported as "pypandoc is missing".
        try:
            import pypandoc
        except ImportError as exc:
            raise ConversionError(
                "pypandoc is required for .doc file processing. Install it with: pip install pypandoc"
            ) from exc

        try:
            # NOTE(review): pandoc's list of input formats does not seem to
            # include the binary .doc format (only .docx) - confirm that this
            # call can actually succeed for real .doc files.
            content = pypandoc.convert_file(file_path, 'markdown')

            metadata.update({
                "file_type": "doc",
                "extractor": "pypandoc"
            })

            content = self._clean_content(content)
            return ConversionResult(content, metadata)
        except Exception as exc:
            # Chain the original error so the root cause stays in tracebacks.
            raise ConversionError(f"Failed to process .doc file {file_path}: {exc}") from exc
    
    def _process_docx_file(self, file_path: str, metadata: Dict[str, Any]) -> ConversionResult:
        """Convert a ``.docx`` file to markdown using python-docx.

        Non-empty paragraphs are emitted first (heading styles become ATX
        headings), followed by each table in ``doc.tables`` rendered as a
        markdown table. Tables therefore do not keep their original position
        between the paragraphs.

        Args:
            file_path: Path of the ``.docx`` file to convert.
            metadata: Metadata collected so far; updated in place with
                ``paragraph_count``, ``section_count``, ``file_type`` and
                ``extractor``.

        Returns:
            ConversionResult holding the cleaned markdown and the metadata.

        Raises:
            ConversionError: If python-docx is not installed or the document
                cannot be read or converted. The triggering exception is
                attached as ``__cause__``.
        """
        # Import lazily so the dependency is only required for .docx input.
        # The import has its own try block so that an ImportError raised
        # later is not misreported as "python-docx is missing".
        try:
            from docx import Document
        except ImportError as exc:
            raise ConversionError(
                "python-docx is required for .docx file processing. Install it with: pip install python-docx"
            ) from exc

        try:
            doc = Document(file_path)

            metadata.update({
                "paragraph_count": len(doc.paragraphs),
                "section_count": len(doc.sections),
                "file_type": "docx",
                "extractor": "python-docx"
            })

            content_parts = [
                self._docx_paragraph_to_markdown(paragraph)
                for paragraph in doc.paragraphs
                if paragraph.text.strip()
            ]

            # preserve_layout is optional configuration that may or may not
            # be set on the instance; it does not change between tables.
            preserve_layout = getattr(self, 'preserve_layout', False)
            for table_idx, table in enumerate(doc.tables):
                content_parts.extend(
                    self._docx_table_to_markdown(table, table_idx, preserve_layout)
                )

            content = self._clean_content('\n'.join(content_parts))
            return ConversionResult(content, metadata)
        except Exception as exc:
            # Chain the original error so the root cause stays in tracebacks.
            raise ConversionError(f"Failed to process .docx file {file_path}: {exc}") from exc

    def _docx_paragraph_to_markdown(self, paragraph: Any) -> str:
        """Render one non-empty paragraph as a markdown heading or plain text."""
        style = paragraph.style
        # Guard against a missing style or style name instead of crashing
        # the whole conversion on ``None.startswith``.
        style_name = (style.name if style is not None else None) or ""
        if not style_name.startswith('Heading'):
            return paragraph.text

        # Keep the heading on one line even if it contains soft line breaks;
        # otherwise only its first line would be part of the heading.
        title = ' '.join(paragraph.text.split())
        try:
            level = int(style_name.replace('Heading ', ''))
        except ValueError:
            # Heading style without a parsable level: fall back to level 2.
            return f"\n## {title}\n"
        # Markdown supports heading levels 1 through 6 only.
        level = max(1, min(level, 6))
        return f"\n{'#' * level} {title}\n"

    def _docx_table_to_markdown(self, table: Any, table_idx: int,
                                preserve_layout: bool, row_limit: int = 20) -> list:
        """Render one table as markdown lines; return ``[]`` for a table without rows."""
        rows = table.rows
        if not rows:
            # Nothing to render, so no heading is emitted either.
            return []

        parts = []
        if preserve_layout:
            parts.append(f"\n### Table {table_idx+1}\n")

        # Merged cells cannot be expressed in a markdown table; warn once.
        has_merged = any(
            cell._tc.xpath('.//w:vMerge') or cell._tc.xpath('.//w:gridSpan')
            for row in rows
            for cell in row.cells
        )
        if has_merged:
            parts.append("*Warning: Table contains merged cells which may not render correctly in markdown.*\n")

        total_rows = len(rows)
        if total_rows > row_limit:
            parts.append(f"*Table truncated to first {row_limit} rows out of {total_rows} total.*\n")

        table_data = []
        for i, row in enumerate(rows):
            if i >= row_limit:
                break
            # Newlines would end the markdown row and a bare "|" would start
            # a new column, so flatten the former and escape the latter.
            table_data.append([
                cell.text.strip().replace('\n', ' ').replace('|', '\\|')
                for cell in row.cells
            ])
        if not table_data:
            return parts

        # Pad short rows so every row has the same number of columns.
        width = max(len(r) for r in table_data)
        table_data = [r + [""] * (width - len(r)) for r in table_data]

        # The first row serves as the markdown table header.
        header, *body = table_data
        parts.append("| " + " | ".join(header) + " |")
        parts.append("| " + " | ".join(["---"] * len(header)) + " |")
        parts.extend("| " + " | ".join(r) + " |" for r in body)
        parts.append("")
        return parts
    
    def _clean_content(self, content: str) -> str:
        """Clean up the extracted Word content.
        
        Args:
            content: Raw Word text content
            
        Returns:
            Cleaned text content
        """
        # Remove excessive whitespace and normalize
        lines = content.split('\n')
        cleaned_lines = []
        
        for line in lines:
            # Remove excessive whitespace
            line = ' '.join(line.split())
            if line.strip():
                cleaned_lines.append(line)
        
        # Join lines and add proper spacing
        content = '\n'.join(cleaned_lines)
        
        # Add spacing around headers
        content = content.replace('## ', '\n## ')
        content = content.replace('### ', '\n### ')
        
        return content.strip()