File size: 10,702 Bytes
d94b56e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
# -*- coding: utf-8 -*-
#

#   pyhwp : hwp file format parser in python
#   Copyright (C) 2010-2023 mete0r <https://github.com/mete0r>
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
from argparse import ArgumentParser
from contextlib import contextmanager
from contextlib import closing
from functools import partial
import gettext
import io
import logging
import os.path
import shutil
import shutil
import sys
import base64
import re
import tempfile
import mimetypes

from . import __version__ as version
from .cli import init_logger
from .transforms import BaseTransform
from .utils import cached_property


PY3 = sys.version_info.major == 3
logger = logging.getLogger(__name__)
locale_dir = os.path.join(os.path.dirname(__file__), 'locale')
locale_dir = os.path.abspath(locale_dir)
t = gettext.translation('hwp5html', locale_dir, fallback=True)
_ = t.gettext


RESOURCE_PATH_XSL_CSS = 'xsl/hwp5css.xsl'
RESOURCE_PATH_XSL_XHTML = 'xsl/hwp5html.xsl'


class HTMLTransform(BaseTransform):

    @property
    def transform_hwp5_to_css(self):
        '''
        >>> T.transform_hwp5_to_css(hwp5file, 'styles.css')
        '''
        transform_xhwp5 = self.transform_xhwp5_to_css
        return self.make_transform_hwp5(transform_xhwp5)

    @property
    def transform_hwp5_to_xhtml(self):
        '''
        >>> T.transform_hwp5_to_xhtml(hwp5file, 'index.xhtml')
        '''
        transform_xhwp5 = self.transform_xhwp5_to_xhtml
        return self.make_transform_hwp5(transform_xhwp5)

    def transform_hwp5_to_dir(self, hwp5file, outdir):
        '''
        >>> T.transform_hwp5_to_dir(hwp5file, 'output')
        '''
        with self.transformed_xhwp5_at_temp(hwp5file) as xhwp5path:
            self.transform_xhwp5_to_dir(xhwp5path, outdir)

        bindata_dir = os.path.join(outdir, 'bindata')
        self.extract_bindata_dir(hwp5file, bindata_dir)

    @cached_property
    def transform_xhwp5_to_css(self):
        '''
        >>> T.transform_xhwp5_to_css('hwp5.xml', 'styles.css')
        '''
        resource_path = RESOURCE_PATH_XSL_CSS
        return self.make_xsl_transform(resource_path)

    @cached_property
    def transform_xhwp5_to_xhtml(self):
        '''
        >>> T.transform_xhwp5_to_xhtml('hwp5.xml', 'index.xhtml')
        '''
        resource_path = RESOURCE_PATH_XSL_XHTML
        return self.make_xsl_transform(resource_path)

    def transform_xhwp5_to_dir(self, xhwp5path, outdir):
        '''
        >>> T.transform_xhwp5_to_dir('hwp5.xml', 'output')
        '''
        html_path = os.path.join(outdir, 'index.xhtml')
        with io.open(html_path, 'wb') as f:
            self.transform_xhwp5_to_xhtml(xhwp5path, f)

        css_path = os.path.join(outdir, 'styles.css')
        with io.open(css_path, 'wb') as f:
            self.transform_xhwp5_to_css(xhwp5path, f)

    def transform_hwp5_to_single(self, hwp5file, outpath):
        """
        Convert HWP file to a single HTML file with embedded CSS and images.
        """
        # Create a temporary directory for intermediate conversion
        with tempfile.TemporaryDirectory() as temp_dir:
            # 1. Perform standard conversion to temp dir
            self.transform_hwp5_to_dir(hwp5file, temp_dir)
            
            # Paths to generated files
            html_path = os.path.join(temp_dir, 'index.xhtml')
            css_path = os.path.join(temp_dir, 'styles.css')
            bindata_dir = os.path.join(temp_dir, 'bindata')
            
            # 2. Read HTML and CSS
            if os.path.exists(html_path):
                with io.open(html_path, 'r', encoding='utf-8') as f:
                    html_content = f.read()
            else:
                raise RuntimeError("HTML generation failed")
                
            css_content = ""
            if os.path.exists(css_path):
                with io.open(css_path, 'r', encoding='utf-8') as f:
                    css_content = f.read()
            
            # 3. Embed CSS
            # Insert <style> tag before </head>
            if css_content:
                style_tag = f'<style>\n{css_content}\n</style>\n'
                html_content = html_content.replace('</head>', f'{style_tag}</head>')
                # Remove external link to css if present (optional, but good practice)
                # <link rel="stylesheet" type="text/css" href="styles.css" />
                html_content = re.sub(r'<link[^>]+href="styles.css"[^>]*/>', '', html_content)

            # 4. Embed Images
            if os.path.exists(bindata_dir):
                # Function to replace image src with base64 data
                def replace_image(match):
                    src = match.group(1)
                    if src.startswith('bindata/'):
                        image_filename = os.path.basename(src)
                        image_path = os.path.join(bindata_dir, image_filename)
                        if os.path.exists(image_path):
                            # Guess mime type
                            mime_type, _ = mimetypes.guess_type(image_path)
                            if not mime_type:
                                mime_type = 'image/png' # Default fallback
                                
                            with open(image_path, 'rb') as img_f:
                                img_data = img_f.read()
                                b64_data = base64.b64encode(img_data).decode('ascii')
                                return f'src="data:{mime_type};base64,{b64_data}"'
                    return match.group(0) # Return original if not matched/found
                
                # Replace src="bindata/..." with data URI
                # Regex looks for src="bindata/[^"]+"
                html_content = re.sub(r'src="(bindata/[^"]+)"', replace_image, html_content)
            
            # 5. Write final output
            with io.open(outpath, 'w', encoding='utf-8') as f:
                f.write(html_content)

    def extract_bindata_dir(self, hwp5file, bindata_dir):
        if 'BinData' not in hwp5file:
            return
        bindata_stg = hwp5file['BinData']
        if not os.path.exists(bindata_dir):
            os.mkdir(bindata_dir)

        from hwp5.storage import unpack
        unpack(bindata_stg, bindata_dir)


def main():
    from .dataio import ParseError
    from .errors import InvalidHwp5FileError
    from .utils import make_open_dest_file
    from .xmlmodel import Hwp5File

    argparser = main_argparser()
    args = argparser.parse_args()
    init_logger(args)

    hwp5path = args.hwp5file

    html_transform = HTMLTransform()

    open_dest = make_open_dest_file(args.output)
    if args.css:
        transform = html_transform.transform_hwp5_to_css
        open_dest = wrap_for_css(open_dest)
    elif args.html:
        transform = html_transform.transform_hwp5_to_xhtml
        open_dest = wrap_for_xml(open_dest)
    elif args.embed_image:
        transform = html_transform.transform_hwp5_to_single
        # For single file, we need a file path, not a file object
        # transform_hwp5_to_single expects a path string
        if not args.output:
            args.output = os.path.splitext(os.path.basename(hwp5path))[0] + '.html'
        open_dest = lambda: contextmanager(lambda: (yield args.output))()
    else:
        transform = html_transform.transform_hwp5_to_dir
        dest_path = args.output
        if not dest_path:
            dest_path = os.path.splitext(os.path.basename(hwp5path))[0]
        open_dest = partial(open_dir, dest_path)

    print(f"DEBUG: Input file: {hwp5path}")
    print(f"DEBUG: Args: css={args.css}, html={args.html}, embed_image={getattr(args, 'embed_image', False)}")

    try:
        with closing(Hwp5File(hwp5path)) as hwp5file:
            with open_dest() as dest:
                print(f"DEBUG: Starting transformation using {transform}")
                transform(hwp5file, dest)
                print("DEBUG: Transformation finished")
    except Exception as e:
        import traceback
        traceback.print_exc()
        logger.error('%s', e)
        sys.exit(1)


def main_argparser():
    parser = ArgumentParser(
        prog='hwp5html',
        description=_('HWPv5 to HTML converter'),
    )
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s {}'.format(version)
    )
    parser.add_argument(
        '--loglevel',
        help=_('Set log level.'),
    )
    parser.add_argument(
        '--logfile',
        help=_('Set log file.'),
    )
    parser.add_argument(
        '--output',
        help=_('Output file'),
    )
    parser.add_argument(
        'hwp5file',
        metavar='<hwp5file>',
        help=_('.hwp file to convert'),
    )
    generator_group = parser.add_mutually_exclusive_group()
    generator_group.add_argument(
        '--css',
        action='store_true',
        help=_('Generate CSS'),
    )
    generator_group.add_argument(
        '--html',
        action='store_true',
        help=_('Generate HTML'),
    )
    generator_group.add_argument(
        '--embed-image',
        action='store_true',
        help=_('Embed images and CSS into a single HTML file'),
    )
    return parser


@contextmanager
def open_dir(path):
    if os.path.exists(path):
        shutil.rmtree(path)
    os.mkdir(path)
    yield path


def wrap_for_css(open_dest):
    from .utils import wrap_open_dest_for_tty
    from .utils import pager
    from .utils import syntaxhighlight
    return wrap_open_dest_for_tty(open_dest, [
        pager(),
        syntaxhighlight('text/css'),
    ])


def wrap_for_xml(open_dest):
    from .utils import wrap_open_dest_for_tty
    from .utils import pager
    from .utils import syntaxhighlight
    from .utils import xmllint
    return wrap_open_dest_for_tty(open_dest, [
        pager(),
        syntaxhighlight('application/xml'),
        xmllint(format=True, nonet=True),
    ])


if __name__ == '__main__':
    main()