TIMBOVILL commited on
Commit
59acea6
·
verified ·
1 Parent(s): 375cd65

Upload 3 files

Browse files
Files changed (3) hide show
  1. requirements 2.txt +4 -0
  2. setup.py +42 -0
  3. webpage2html.py +396 -0
requirements 2.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ beautifulsoup4>=4.0.0
2
+ lxml>=3.4.4
3
+ requests>=2.5.2
4
+ termcolor>=1.1.0
setup.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# NOTE(fix): `entry_points` and `test_suite` are setuptools features; the
# original `from distutils.core import setup` silently ignored them, so the
# `webpage2html` console script was never installed.  Import from setuptools
# and declare the runtime dependencies so `pip install` pulls them in.
from setuptools import setup

setup(
    name='webpage2html',
    version='0.3.7',

    author='Wenlei Zhu',
    author_email='i@ztrix.me',
    url='https://github.com/zTrix/webpage2html',

    license='LICENSE.txt',
    keywords="webpage html convert",
    description='Save/convert web pages to a single editable html file',
    long_description='View https://github.com/zTrix/webpage2html for project description and usage',

    py_modules=['webpage2html'],

    # Runtime dependencies, kept in sync with the requirements file.
    install_requires=[
        'beautifulsoup4>=4.0.0',
        'lxml>=3.4.4',
        'requests>=2.5.2',
        'termcolor>=1.1.0',
    ],

    # Refers to test/test.py
    test_suite='test.test',

    entry_points={
        'console_scripts': [
            'webpage2html=webpage2html:main'
        ]
    },
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Environment :: Console',
        'Intended Audience :: Developers',
        'Operating System :: POSIX',
        'Operating System :: MacOS :: MacOS X',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Topic :: Software Development',
        'Topic :: System',
        'Topic :: Terminals',
        'Topic :: Utilities',
    ],
)
webpage2html.py ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ from __future__ import print_function
5
+
6
+ import argparse
7
+ import base64
8
+ import codecs
9
+ import datetime
10
+ import os
11
+ import re
12
+ import sys
13
+
14
+ import requests
15
+ from bs4 import BeautifulSoup
16
+ from termcolor import colored
17
+
18
+ if sys.version > '3':
19
+ from urllib.parse import urlparse, urlunsplit, urljoin, quote
20
+ else:
21
+ from urlparse import urlparse, urlunsplit, urljoin
22
+ from urllib import quote
23
+
24
# Pattern matching whole `url(...)` tokens in CSS.
# NOTE(review): appears unused within this file — possibly kept for callers; confirm before removing.
re_css_url = re.compile(r'(url\(.*?\))')
# In-memory fetch cache shared across get() calls: quoted absolute URL -> content.
webpage2html_cache = {}
26
+
27
+
28
def log(s, color=None, on_color=None, attrs=None, new_line=True):
    """Write a progress/diagnostic message to stderr.

    When *color* is given the text is wrapped with termcolor's ``colored``;
    a trailing newline is emitted unless *new_line* is False, and stderr is
    always flushed so output interleaves predictably.
    """
    text = str(s)
    if color:
        text = colored(text, color, on_color, attrs)
    print(text, end=' ', file=sys.stderr)
    if new_line:
        sys.stderr.write('\n')
    sys.stderr.flush()
36
+
37
+
38
def absurl(index, relpath=None, normpath=None):
    """Resolve *relpath* against the base *index* (URL or filesystem path).

    Remote bases are joined with ``urljoin`` and re-serialized without the
    fragment; local bases fall back to ``os.path`` joining.  *normpath* is
    applied to the path component and defaults to the identity function.
    """
    norm = normpath if normpath is not None else (lambda p: p)
    remote = index.lower().startswith('http') or bool(relpath and relpath.startswith('http'))
    if remote:
        # netloc may carry basic-auth credentials, so keep it verbatim
        # instead of rebuilding from a bare domain.
        parts = urlparse(urljoin(index, relpath))
        return urlunsplit((parts.scheme, parts.netloc, norm(parts.path), parts.query, ''))
    if relpath:
        return norm(os.path.join(os.path.dirname(index), relpath))
    return index
51
+
52
+
53
def get(index, relpath=None, verbose=True, usecache=True, verify=True, ignore_error=False, username=None, password=None):
    """Fetch a resource and return ``(content, extra)``.

    *index* is either a URL or a local file path; *relpath* is resolved
    against it.  For HTTP fetches *extra* is ``{'url': final_url,
    'content-type': ...}`` (so callers can follow redirects); for local
    files and failures it is ``None``.  Failures return ``('', None)``
    after logging — this is deliberately best-effort.
    """
    global webpage2html_cache
    if index.startswith('http') or (relpath and relpath.startswith('http')):
        full_path = absurl(index, relpath)
        if not full_path:
            if verbose:
                log('[ WARN ] invalid path, %s %s' % (index, relpath), 'yellow')
            return '', None
        # urllib2 only accepts valid url, the following code is taken from urllib
        # http://svn.python.org/view/python/trunk/Lib/urllib.py?r1=71780&r2=71779&pathrev=71780
        full_path = quote(full_path, safe="%/:=&?~#+!$,;'@()*[]")
        if usecache and full_path in webpage2html_cache:
            if verbose:
                log('[ CACHE HIT ] - %s' % full_path)
            return webpage2html_cache[full_path], None
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'
        }
        auth = None
        if username and password:
            auth = requests.auth.HTTPBasicAuth(username, password)
        try:
            response = requests.get(full_path, headers=headers, verify=verify, auth=auth)
            if verbose:
                log('[ GET ] %d - %s' % (response.status_code, response.url))
            if not ignore_error and (response.status_code >= 400 or response.status_code < 200):
                content = ''
            elif response.headers.get('content-type', '').lower().startswith('text/'):
                content = response.text
            else:
                content = response.content
            if usecache:
                # BUGFIX: the cache was stored only under response.url but
                # looked up under full_path, so any redirected fetch missed
                # the cache forever.  Store under both keys.
                webpage2html_cache[full_path] = content
                webpage2html_cache[response.url] = content
            return content, {'url': response.url, 'content-type': response.headers.get('content-type')}
        except Exception as ex:
            if verbose:
                log('[ WARN ] %s - %s %s' % ('???', full_path, ex), 'yellow')
            return '', None
    elif os.path.exists(index):
        # Local mode: resolve relpath (stripping fragment/query) against the
        # directory containing index; with no relpath read index itself.
        if relpath:
            relpath = relpath.split('#')[0].split('?')[0]
            if os.path.exists(relpath):
                full_path = relpath
            else:
                full_path = os.path.normpath(os.path.join(os.path.dirname(index), relpath))
        else:
            full_path = index
        try:
            # BUGFIX: use a context manager so the file handle is closed
            # (the original leaked one descriptor per resource).
            with open(full_path, 'rb') as f:
                ret = f.read()
            if verbose:
                log('[ LOCAL ] found - %s' % full_path)
            return ret, None
        except IOError as err:
            if verbose:
                log('[ WARN ] file not found - %s %s' % (full_path, str(err)), 'yellow')
            return '', None
    else:
        if verbose:
            log('[ ERROR ] invalid index - %s' % index, 'red')
        return '', None
124
+
125
+
126
def data_to_base64(index, src, verbose=True):
    """Fetch *src* (resolved against *index*) and inline it as a data: URI.

    The MIME type is guessed from the file extension and overridden by the
    server-reported content-type when available.  If the resource cannot be
    fetched, the absolute URL is returned unchanged as a fallback.
    Doc here: http://en.wikipedia.org/wiki/Data_URI_scheme
    """
    if src.strip().startswith('data:'):
        return src  # already a data: URI, nothing to do
    sp = urlparse(src).path.lower()
    # Extension -> MIME type guess; unknown extensions default to image/png
    # (historical behavior: "what if it's not a valid font type? may not matter").
    ext_to_mime = {
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.ico': 'image/x-icon',
        '.jpg': 'image/jpeg',   # BUGFIX: the IANA-registered type is image/jpeg, not image/jpg
        '.jpeg': 'image/jpeg',
        '.svg': 'image/svg+xml',
        '.ttf': 'application/x-font-ttf',
        '.otf': 'application/x-font-opentype',
        '.woff': 'application/font-woff',
        '.woff2': 'application/font-woff2',
        '.eot': 'application/vnd.ms-fontobject',
        '.sfnt': 'application/font-sfnt',
        '.css': 'text/css',
        '.less': 'text/css',
        '.js': 'application/javascript',
    }
    fmt = ext_to_mime.get(os.path.splitext(sp)[1], 'image/png')
    data, extra_data = get(index, src, verbose=verbose)
    if extra_data and extra_data.get('content-type'):
        # trust the server over the extension guess
        fmt = extra_data.get('content-type').replace(' ', '')
    if not data:
        # fetch failed: leave an absolute URL so the page still renders online
        return absurl(index, src)
    if sys.version > '3':
        if not isinstance(data, bytes):
            data = str.encode(data)
        return ('data:%s;base64,' % fmt) + bytes.decode(base64.b64encode(data))
    # Python 2 legacy path, kept byte-for-byte for parity with old installs.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    return ('data:%s;base64,' % fmt) + base64.b64encode(data)
175
+
176
+
177
# Matches an @charset declaration at the top of a stylesheet, e.g. @charset "utf-8";
css_encoding_re = re.compile(r'''@charset\s+["']([-_a-zA-Z0-9]+)["']\;''', re.I)


def handle_css_content(index, css, verbose=True):
    """Inline every ``url(...)`` reference inside *css* as a data: URI.

    *css* may be str or bytes; bytes are decoded honoring an @charset
    declaration when present.  Returns the rewritten stylesheet text
    (falsy inputs are returned unchanged).
    """
    if not css:
        return css
    if not isinstance(css, str):
        # BUGFIX: the original decoded the bytes first and then called
        # .decode() on the resulting str, which always raised AttributeError
        # and was swallowed by a bare except — declared charsets were never
        # honored on Python 3.  Probe the raw bytes for @charset instead and
        # decode once, with the declared (or default utf-8) encoding.
        probe = css.decode('utf-8', 'replace')
        mo = css_encoding_re.search(probe)
        encoding = mo.group(1) if mo else 'utf-8'
        try:
            css = css.decode(encoding)
        except (LookupError, UnicodeDecodeError, ValueError):
            log('[ WARN ] failed to convert css to encoding %s' % encoding, 'yellow')
            css = probe  # best effort: utf-8 with replacement characters
    # Watch out! how to handle urls which contain parentheses inside? Oh god, css does not support such kind of urls
    # I tested such url in css, and, unfortunately, the css rule is broken. LOL!
    # I have to say that, CSS is awesome!
    reg = re.compile(r'url\s*\((.+?)\)')

    def repl(matchobj):
        src = matchobj.group(1).strip(' \'"')
        return 'url(' + data_to_base64(index, src, verbose=verbose) + ')'

    return reg.sub(repl, css)
208
+
209
+
210
def generate(index, verbose=True, comment=True, keep_script=False, prettify=False, full_url=True, verify=True,
             errorpage=False, username=None, password=None, **kwargs):
    """
    Given an index url such as http://www.google.com or http://custom.domain/index.html
    (or a local file path), return the page as a single self-contained html string.

    Stylesheets become inline <style> tags, images/fonts become data: URIs,
    scripts are dropped unless keep_script is True, and hrefs are rewritten
    to absolute URLs when full_url is True.  Extra **kwargs (e.g. leftover
    CLI namespace entries) are accepted and ignored.
    """
    html_doc, extra_data = get(index, verbose=verbose, verify=verify, ignore_error=errorpage,
                               username=username, password=password)

    # If the fetch was redirected, use the final URL as the base for
    # resolving every relative reference below.
    if extra_data and extra_data.get('url'):
        index = extra_data['url']

    # now build the dom tree
    soup = BeautifulSoup(html_doc, 'lxml')
    soup_title = soup.title.string if soup.title else ''

    for link in soup('link'):
        if link.get('href'):
            # Icons are embedded as data: URIs; the original href is kept in data-href.
            if 'mask-icon' in (link.get('rel') or []) or 'icon' in (link.get('rel') or []) or 'apple-touch-icon' in (
                    link.get('rel') or []) or 'apple-touch-icon-precomposed' in (link.get('rel') or []):
                link['data-href'] = link['href']

                link['href'] = data_to_base64(index, link['href'], verbose=verbose)
            elif link.get('type') == 'text/css' or link['href'].lower().endswith('.css') or 'stylesheet' in (
                    link.get('rel') or []):
                # External stylesheet: replace the <link> with an inline <style>
                # tag carrying every original attribute except href.
                new_type = 'text/css' if not link.get('type') else link['type']
                css = soup.new_tag('style', type=new_type)
                css['data-href'] = link['href']
                for attr in link.attrs:
                    if attr in ['href']:
                        continue
                    css[attr] = link[attr]
                css_data, _ = get(index, relpath=link['href'], verbose=verbose)
                new_css_content = handle_css_content(absurl(index, link['href']), css_data, verbose=verbose)
                # if "stylesheet/less" in '\n'.join(link.get('rel') or []).lower():  # fix browser side less: http://lesscss.org/#client-side-usage
                #     # link['href'] = 'data:text/less;base64,' + base64.b64encode(css_data)
                #     link['data-href'] = link['href']
                #     link['href'] = absurl(index, link['href'])
                if False:  # new_css_content.find('@font-face') > -1 or new_css_content.find('@FONT-FACE') > -1:
                    link['href'] = 'data:text/css;base64,' + base64.b64encode(new_css_content)
                else:
                    css.string = new_css_content
                    link.replace_with(css)
            elif full_url:
                link['data-href'] = link['href']
                link['href'] = absurl(index, link['href'])
    for js in soup('script'):
        # Scripts are stripped entirely unless keep_script is set.
        if not keep_script:
            js.replace_with('')
            continue
        if not js.get('src'):
            continue
        new_type = 'text/javascript' if not js.has_attr('type') or not js['type'] else js['type']
        code = soup.new_tag('script', type=new_type)
        code['data-src'] = js['src']
        js_str, _ = get(index, relpath=js['src'], verbose=verbose)
        if type(js_str) == bytes:
            js_str = js_str.decode('utf-8')
        try:
            if js_str.find('</script>') > -1:
                # Literal </script> inside the source would terminate the tag
                # early, so serve it as a data: URI instead of inlining.
                code['src'] = 'data:text/javascript;base64,' + base64.b64encode(js_str.encode()).decode()
            elif js_str.find(']]>') < 0:
                code.string = '<!--//--><![CDATA[//><!--\n' + js_str + '\n//--><!]]>'
            else:
                # replace ]]> does not work at all for chrome, do not believe
                # http://en.wikipedia.org/wiki/CDATA
                # code.string = '<![CDATA[\n' + js_str.replace(']]>', ']]]]><![CDATA[>') + '\n]]>'
                code.string = js_str
        except:
            if verbose:
                log(repr(js_str))
            raise
        js.replace_with(code)
    for img in soup('img'):
        if not img.get('src'):
            continue
        img['data-src'] = img['src']
        img['src'] = data_to_base64(index, img['src'], verbose=verbose)

        # `img` elements may have `srcset` attributes with multiple sets of images.
        # To get a lighter document it will be cleared, and used only the standard `src` attribute
        # Maybe add a flag to enable the base64 conversion of each `srcset`?
        # For now a simple warning is displayed informing that image has multiple sources
        # that are stripped.
        if img.get('srcset'):
            img['data-srcset'] = img['srcset']
            del img['srcset']
            if verbose:
                log('[ WARN ] srcset found in img tag. Attribute will be cleared. File src => %s' % (img['data-src']),
                    'yellow')

        def check_alt(attr):
            # Warn about JS image-swap handlers (this.src=...) we cannot rewrite.
            if img.has_attr(attr) and img[attr].startswith('this.src='):
                # we do not handle this situation yet, just warn the user
                if verbose:
                    log('[ WARN ] %s found in img tag and unhandled, which may break page' % (attr), 'yellow')

        check_alt('onerror')
        check_alt('onmouseover')
        check_alt('onmouseout')
    # Final pass over every tag: absolutize anchors and inline remaining CSS
    # (style attributes, inline <style> tags, css-typed <link> contents).
    for tag in soup(True):
        if full_url and tag.name == 'a' and tag.has_attr('href') and not tag['href'].startswith('#'):
            tag['data-href'] = tag['href']
            tag['href'] = absurl(index, tag['href'])
        if tag.has_attr('style'):
            if tag['style']:
                tag['style'] = handle_css_content(index, tag['style'], verbose=verbose)
        elif tag.name == 'link' and tag.has_attr('type') and tag['type'] == 'text/css':
            if tag.string:
                tag.string = handle_css_content(index, tag.string, verbose=verbose)
        elif tag.name == 'style':
            if tag.string:
                tag.string = handle_css_content(index, tag.string, verbose=verbose)

    # finally insert some info into comments
    if comment:
        for html in soup('html'):
            html.insert(0, BeautifulSoup('<!-- \n single html processed by https://github.com/zTrix/webpage2html\n '
                                         'title: %s\n url: %s\n date: %s\n-->' % (soup_title, index,
                                                                                  datetime.datetime.now().ctime()),
                                         'lxml'))
            break
    if prettify:
        return soup.prettify(formatter='html')
    else:
        return str(soup)
336
+
337
+
338
def usage():
    """Print the command-line help text to stdout."""
    help_text = """
usage:

    $ webpage2html [options] some_url

options:

    -h, --help              help page, you are reading this now!
    -q, --quiet             don't show verbose url get log in stderr
    -s, --script            keep javascript in the generated html

examples:

    $ webpage2html -h
        you are reading this help message

    $ webpage2html http://www.google.com > google.html
        save google index page for offline reading, keep style untainted

    $ webpage2html -s http://gabrielecirulli.github.io/2048/ > 2048.html
        save dynamic page with Javascript example
        the 2048 game can be played offline after being saved

    $ webpage2html /path/to/xxx.html > xxx_single.html
        combine local saved xxx.html with a directory named xxx_files together into a single html file
"""
    print(help_text)
365
+
366
+
367
def main():
    """CLI entry point: parse options, build the single-page HTML, and write it out."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-q', '--quiet', action='store_true', help="don't show verbose url get log in stderr")
    parser.add_argument('-s', '--script', action='store_true', help="keep javascript in the generated html")
    parser.add_argument('-k', '--insecure', action='store_true', help="ignore the certificate")
    parser.add_argument('-o', '--output', help="save output to")
    parser.add_argument('-u', '--username', help="use HTTP basic auth with specified username")
    parser.add_argument('-p', '--password', help="use HTTP basic auth with specified password")
    parser.add_argument('--errorpage', action='store_true', help="crawl an error page")
    parser.add_argument("url", help="the website to store")
    args = parser.parse_args()

    # Map CLI flags onto the keyword names generate() expects; the remaining
    # namespace entries are absorbed by generate's **kwargs.
    options = vars(args)
    options['verbose'] = not args.quiet
    options['keep_script'] = args.script
    options['verify'] = not args.insecure
    options['index'] = args.url

    result = generate(**options)
    if args.output and args.output != '-':
        with open(args.output, 'wb') as out:
            out.write(result.encode())
    else:
        # Re-wrap stdout so unicode output survives redirection.
        sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
        sys.stdout.write(result)
393
+
394
+
395
# Allow running this module directly as a script (mirrors the setup.py console entry point).
if __name__ == '__main__':
    main()