Spaces:
Runtime error
Runtime error
Upload 3 files
Browse files- requirements 2.txt +4 -0
- setup.py +42 -0
- webpage2html.py +396 -0
requirements 2.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
beautifulsoup4>=4.0.0
|
| 2 |
+
lxml>=3.4.4
|
| 3 |
+
requests>=2.5.2
|
| 4 |
+
termcolor>=1.1.0
|
setup.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# setup script for webpage2html.
#
# setuptools (rather than distutils.core) is required here: distutils'
# setup() silently ignores the ``entry_points`` argument, so the
# ``webpage2html`` console script would never be installed.  distutils is
# also deprecated (PEP 632) and removed from the stdlib in Python 3.12.
from setuptools import setup

setup(
    name='webpage2html',
    version='0.3.7',

    author='Wenlei Zhu',
    author_email='i@ztrix.me',
    url='https://github.com/zTrix/webpage2html',

    license='LICENSE.txt',
    keywords="webpage html convert",
    description='Save/convert web pages to a single editable html file',
    long_description='View https://github.com/zTrix/webpage2html for project description and usage',

    py_modules=['webpage2html'],

    # Refers to test/test.py
    test_suite='test.test',

    entry_points={
        'console_scripts': [
            'webpage2html=webpage2html:main'
        ]
    },
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Environment :: Console',
        'Intended Audience :: Developers',
        'Operating System :: POSIX',
        'Operating System :: MacOS :: MacOS X',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Topic :: Software Development',
        'Topic :: System',
        'Topic :: Terminals',
        'Topic :: Utilities',
    ],
)
|
webpage2html.py
ADDED
|
@@ -0,0 +1,396 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
from __future__ import print_function
|
| 5 |
+
|
| 6 |
+
import argparse
|
| 7 |
+
import base64
|
| 8 |
+
import codecs
|
| 9 |
+
import datetime
|
| 10 |
+
import os
|
| 11 |
+
import re
|
| 12 |
+
import sys
|
| 13 |
+
|
| 14 |
+
import requests
|
| 15 |
+
from bs4 import BeautifulSoup
|
| 16 |
+
from termcolor import colored
|
| 17 |
+
|
| 18 |
+
if sys.version > '3':
|
| 19 |
+
from urllib.parse import urlparse, urlunsplit, urljoin, quote
|
| 20 |
+
else:
|
| 21 |
+
from urlparse import urlparse, urlunsplit, urljoin
|
| 22 |
+
from urllib import quote
|
| 23 |
+
|
| 24 |
+
re_css_url = re.compile(r'(url\(.*?\))')
|
| 25 |
+
webpage2html_cache = {}
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def log(s, color=None, on_color=None, attrs=None, new_line=True):
    """Write a status message to stderr, colorized via termcolor when
    *color* is given; a trailing space is always appended and the stream
    is flushed so progress output appears promptly."""
    message = str(s)
    if color:
        message = colored(message, color, on_color, attrs)
    print(message, end=' ', file=sys.stderr)
    if new_line:
        sys.stderr.write('\n')
    sys.stderr.flush()
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def absurl(index, relpath=None, normpath=None):
    """Resolve *relpath* against *index* and return an absolute URL or path.

    For http(s) inputs the URL fragment is dropped and *normpath*
    (identity by default) is applied only to the path component; the
    netloc is kept verbatim because it may carry basic-auth credentials.
    For local paths the result is a plain filesystem join.
    """
    if normpath is None:
        def normpath(x):
            return x
    is_remote = index.lower().startswith('http') or bool(relpath and relpath.startswith('http'))
    if is_remote:
        parts = urlparse(urljoin(index, relpath))
        # netloc contains basic auth, so do not use domain
        return urlunsplit((parts.scheme, parts.netloc, normpath(parts.path), parts.query, ''))
    if relpath:
        return normpath(os.path.join(os.path.dirname(index), relpath))
    return index
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def get(index, relpath=None, verbose=True, usecache=True, verify=True, ignore_error=False, username=None, password=None):
    """Fetch a resource and return ``(content, extra)``.

    ``index`` may be a URL or a local file path; ``relpath`` is an
    optional reference relative to it.  For HTTP fetches ``extra`` is a
    dict with the final ``url`` and ``content-type``; for local files and
    all failures it is ``None``.  On any failure ``('', None)`` is
    returned.  Fix applied: local files are now read inside ``with``
    blocks, so handles are closed deterministically instead of leaking
    until garbage collection.
    """
    global webpage2html_cache
    if index.startswith('http') or (relpath and relpath.startswith('http')):
        full_path = absurl(index, relpath)
        if not full_path:
            if verbose:
                log('[ WARN ] invalid path, %s %s' % (index, relpath), 'yellow')
            return '', None
        # urllib2 only accepts valid url, the following code is taken from urllib
        # http://svn.python.org/view/python/trunk/Lib/urllib.py?r1=71780&r2=71779&pathrev=71780
        full_path = quote(full_path, safe="%/:=&?~#+!$,;'@()*[]")
        if usecache:
            if full_path in webpage2html_cache:
                if verbose:
                    log('[ CACHE HIT ] - %s' % full_path)
                return webpage2html_cache[full_path], None
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'
        }

        auth = None
        if username and password:
            auth = requests.auth.HTTPBasicAuth(username, password)

        try:
            response = requests.get(full_path, headers=headers, verify=verify, auth=auth)
            if verbose:
                log('[ GET ] %d - %s' % (response.status_code, response.url))
            if not ignore_error and (response.status_code >= 400 or response.status_code < 200):
                content = ''
            elif response.headers.get('content-type', '').lower().startswith('text/'):
                # text/* responses are decoded to str; everything else stays bytes
                content = response.text
            else:
                content = response.content
            if usecache:
                # cache under the *final* URL so redirects are remembered
                webpage2html_cache[response.url] = content
            return content, {'url': response.url, 'content-type': response.headers.get('content-type')}
        except Exception as ex:
            if verbose:
                log('[ WARN ] %s - %s %s' % ('???', full_path, ex), 'yellow')
            return '', None
    elif os.path.exists(index):
        if relpath:
            # strip fragment and query before touching the filesystem
            relpath = relpath.split('#')[0].split('?')[0]
            if os.path.exists(relpath):
                full_path = relpath
            else:
                full_path = os.path.normpath(os.path.join(os.path.dirname(index), relpath))
            try:
                # context manager closes the handle promptly (was a leak)
                with open(full_path, 'rb') as f:
                    ret = f.read()
                if verbose:
                    log('[ LOCAL ] found - %s' % full_path)
                return ret, None
            except IOError as err:
                if verbose:
                    log('[ WARN ] file not found - %s %s' % (full_path, str(err)), 'yellow')
                return '', None
        else:
            try:
                # context manager closes the handle promptly (was a leak)
                with open(index, 'rb') as f:
                    ret = f.read()
                if verbose:
                    log('[ LOCAL ] found - %s' % index)
                return ret, None
            except IOError as err:
                if verbose:
                    log('[ WARN ] file not found - %s %s' % (index, str(err)), 'yellow')
                return '', None
    else:
        if verbose:
            log('[ ERROR ] invalid index - %s' % index, 'red')
        return '', None
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def data_to_base64(index, src, verbose=True):
    """Return *src* inlined as a data URI.

    doc here: http://en.wikipedia.org/wiki/Data_URI_scheme
    The MIME type is guessed from the path suffix and overridden by the
    server-reported content-type when available.  When the resource
    cannot be fetched, the absolute URL is returned unchanged.
    """
    sp = urlparse(src).path.lower()
    if src.strip().startswith('data:'):
        # already a data URI, nothing to do
        return src
    # suffix -> MIME lookup table, tried in order
    suffix_to_mime = (
        (('.png',), 'image/png'),
        (('.gif',), 'image/gif'),
        (('.ico',), 'image/x-icon'),
        (('.jpg', '.jpeg'), 'image/jpg'),
        (('.svg',), 'image/svg+xml'),
        (('.ttf',), 'application/x-font-ttf'),
        (('.otf',), 'application/x-font-opentype'),
        (('.woff',), 'application/font-woff'),
        (('.woff2',), 'application/font-woff2'),
        (('.eot',), 'application/vnd.ms-fontobject'),
        (('.sfnt',), 'application/font-sfnt'),
        (('.css', '.less'), 'text/css'),
        (('.js',), 'application/javascript'),
    )
    # what if it's not a valid font type? may not matter
    fmt = 'image/png'
    for suffixes, mime in suffix_to_mime:
        if sp.endswith(suffixes):
            fmt = mime
            break
    data, extra_data = get(index, src, verbose=verbose)
    if extra_data and extra_data.get('content-type'):
        fmt = extra_data.get('content-type').replace(' ', '')
    if not data:
        return absurl(index, src)
    if sys.version > '3':
        if type(data) is bytes:
            encoded = base64.b64encode(data)
        else:
            encoded = base64.b64encode(str.encode(data))
        return ('data:%s;base64,' % fmt) + bytes.decode(encoded)
    # Python 2 path: force utf-8 default encoding before concatenating
    reload(sys)
    sys.setdefaultencoding('utf-8')
    return ('data:%s;base64,' % fmt) + base64.b64encode(data)
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
# matches an optional '@charset "xxx";' declaration at the top of a stylesheet
css_encoding_re = re.compile(r'''@charset\s+["']([-_a-zA-Z0-9]+)["']\;''', re.I)


def handle_css_content(index, css, verbose=True):
    """Inline every url(...) reference inside *css* as a data URI.

    ``index`` is the absolute URL/path of the stylesheet itself, used to
    resolve relative references.  *css* may be ``str`` or ``bytes``;
    bytes are decoded before the substitution runs.  Returns the
    rewritten CSS text (or *css* unchanged when it is empty).
    """
    if not css:
        return css
    if not isinstance(css, str):
        if sys.version > '3':
            # Python 3: decode bytes with the default utf-8 codec first
            css = bytes.decode(css)
            mo = css_encoding_re.search(css)
        else:
            mo = css_encoding_re.search(css)
        if mo:
            # NOTE(review): on Python 3 `css` is already str here, so
            # str.decode raises AttributeError and falls into the bare
            # except below — effectively the @charset is ignored; confirm
            # this is the intended behavior.
            try:
                css = css.decode(mo.group(1))
            except:
                log('[ WARN ] failed to convert css to encoding %s' % mo.group(1), 'yellow')
    # Watch out! how to handle urls which contain parentheses inside? Oh god, css does not support such kind of urls
    # I tested such url in css, and, unfortunately, the css rule is broken. LOL!
    # I have to say that, CSS is awesome!
    reg = re.compile(r'url\s*\((.+?)\)')

    def repl(matchobj):
        # strip surrounding quotes/spaces from the captured url argument
        src = matchobj.group(1).strip(' \'"')
        # if src.lower().endswith('woff') or src.lower().endswith('ttf') or src.lower().endswith('otf') or src.lower().endswith('eot'):
        #     # dont handle font data uri currently
        #     return 'url(' + src + ')'
        return 'url(' + data_to_base64(index, src, verbose=verbose) + ')'

    css = reg.sub(repl, css)
    return css
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def generate(index, verbose=True, comment=True, keep_script=False, prettify=False, full_url=True, verify=True,
             errorpage=False, username=None, password=None, **kwargs):
    """
    given a index url such as http://www.google.com, http://custom.domain/index.html
    return generated single html

    Every linked stylesheet is inlined as a <style> tag, every image and
    icon becomes a data URI, and scripts are either dropped (default) or
    embedded (``keep_script=True``).  Extra keyword arguments are
    accepted and ignored so the argparse namespace from main() can be
    splatted in directly.
    """
    html_doc, extra_data = get(index, verbose=verbose, verify=verify, ignore_error=errorpage,
                               username=username, password=password)

    # follow redirects: use the final URL as the base for relative refs
    if extra_data and extra_data.get('url'):
        index = extra_data['url']

    # now build the dom tree
    soup = BeautifulSoup(html_doc, 'lxml')
    soup_title = soup.title.string if soup.title else ''

    # --- <link> tags: icons become data URIs, stylesheets become <style> ---
    for link in soup('link'):
        if link.get('href'):
            if 'mask-icon' in (link.get('rel') or []) or 'icon' in (link.get('rel') or []) or 'apple-touch-icon' in (
                    link.get('rel') or []) or 'apple-touch-icon-precomposed' in (link.get('rel') or []):
                link['data-href'] = link['href']

                link['href'] = data_to_base64(index, link['href'], verbose=verbose)
            elif link.get('type') == 'text/css' or link['href'].lower().endswith('.css') or 'stylesheet' in (
                    link.get('rel') or []):
                new_type = 'text/css' if not link.get('type') else link['type']
                css = soup.new_tag('style', type=new_type)
                css['data-href'] = link['href']
                # carry over every attribute except href onto the new tag
                for attr in link.attrs:
                    if attr in ['href']:
                        continue
                    css[attr] = link[attr]
                css_data, _ = get(index, relpath=link['href'], verbose=verbose)
                new_css_content = handle_css_content(absurl(index, link['href']), css_data, verbose=verbose)
                # if "stylesheet/less" in '\n'.join(link.get('rel') or []).lower(): # fix browser side less: http://lesscss.org/#client-side-usage
                #     # link['href'] = 'data:text/less;base64,' + base64.b64encode(css_data)
                #     link['data-href'] = link['href']
                #     link['href'] = absurl(index, link['href'])
                if False:  # new_css_content.find('@font-face') > -1 or new_css_content.find('@FONT-FACE') > -1:
                    link['href'] = 'data:text/css;base64,' + base64.b64encode(new_css_content)
                else:
                    css.string = new_css_content
                    link.replace_with(css)
            elif full_url:
                link['data-href'] = link['href']
                link['href'] = absurl(index, link['href'])
    # --- <script> tags: drop or embed, guarding against '</script>' in body ---
    for js in soup('script'):
        if not keep_script:
            js.replace_with('')
            continue
        if not js.get('src'):
            continue
        new_type = 'text/javascript' if not js.has_attr('type') or not js['type'] else js['type']
        code = soup.new_tag('script', type=new_type)
        code['data-src'] = js['src']
        js_str, _ = get(index, relpath=js['src'], verbose=verbose)
        if type(js_str) == bytes:
            js_str = js_str.decode('utf-8')
        try:
            if js_str.find('</script>') > -1:
                # inlining would terminate the tag early — keep it as a data URI src
                code['src'] = 'data:text/javascript;base64,' + base64.b64encode(js_str.encode()).decode()
            elif js_str.find(']]>') < 0:
                code.string = '<!--//--><![CDATA[//><!--\n' + js_str + '\n//--><!]]>'
            else:
                # replace ]]> does not work at all for chrome, do not believe
                # http://en.wikipedia.org/wiki/CDATA
                # code.string = '<![CDATA[\n' + js_str.replace(']]>', ']]]]><![CDATA[>') + '\n]]>'
                code.string = js_str
        except:
            if verbose:
                log(repr(js_str))
            raise
        js.replace_with(code)
    # --- <img> tags: src to data URI; srcset is stripped (see note below) ---
    for img in soup('img'):
        if not img.get('src'):
            continue
        img['data-src'] = img['src']
        img['src'] = data_to_base64(index, img['src'], verbose=verbose)

        # `img` elements may have `srcset` attributes with multiple sets of images.
        # To get a lighter document it will be cleared, and used only the standard `src` attribute
        # Maybe add a flag to enable the base64 conversion of each `srcset`?
        # For now a simple warning is displayed informing that image has multiple sources
        # that are stripped.

        if img.get('srcset'):
            img['data-srcset'] = img['srcset']
            del img['srcset']
            if verbose:
                log('[ WARN ] srcset found in img tag. Attribute will be cleared. File src => %s' % (img['data-src']),
                    'yellow')

        def check_alt(attr):
            if img.has_attr(attr) and img[attr].startswith('this.src='):
                # we do not handle this situation yet, just warn the user
                if verbose:
                    log('[ WARN ] %s found in img tag and unhandled, which may break page' % (attr), 'yellow')

        check_alt('onerror')
        check_alt('onmouseover')
        check_alt('onmouseout')
    # --- remaining tags: absolutize anchors, inline style attributes/tags ---
    for tag in soup(True):
        if full_url and tag.name == 'a' and tag.has_attr('href') and not tag['href'].startswith('#'):
            tag['data-href'] = tag['href']
            tag['href'] = absurl(index, tag['href'])
        if tag.has_attr('style'):
            if tag['style']:
                tag['style'] = handle_css_content(index, tag['style'], verbose=verbose)
        elif tag.name == 'link' and tag.has_attr('type') and tag['type'] == 'text/css':
            if tag.string:
                tag.string = handle_css_content(index, tag.string, verbose=verbose)
        elif tag.name == 'style':
            if tag.string:
                tag.string = handle_css_content(index, tag.string, verbose=verbose)

    # finally insert some info into comments
    if comment:
        for html in soup('html'):
            html.insert(0, BeautifulSoup('<!-- \n single html processed by https://github.com/zTrix/webpage2html\n '
                                         'title: %s\n url: %s\n date: %s\n-->' % (soup_title, index, datetime.datetime.
                                                                                  now().ctime()), 'lxml'))
            break
    if prettify:
        return soup.prettify(formatter='html')
    else:
        return str(soup)
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
def usage():
    """Print the command-line help text to stdout."""
    print("""
usage:

    $ webpage2html [options] some_url

options:

    -h, --help              help page, you are reading this now!
    -q, --quiet             don't show verbose url get log in stderr
    -s, --script            keep javascript in the generated html

examples:

    $ webpage2html -h
        you are reading this help message

    $ webpage2html http://www.google.com > google.html
        save google index page for offline reading, keep style untainted

    $ webpage2html -s http://gabrielecirulli.github.io/2048/ > 2048.html
        save dynamic page with Javascript example
        the 2048 game can be played offline after being saved

    $ webpage2html /path/to/xxx.html > xxx_single.html
        combine local saved xxx.html with a directory named xxx_files together into a single html file
""")
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
def main():
    """Command-line entry point.

    Parses the CLI flags, renders the single-file html via generate(),
    and writes the result either to the file given with -o/--output or
    to stdout (re-wrapped as utf-8).  Fix applied: removed a dead
    ``kwargs = {}`` assignment that was immediately overwritten.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-q', '--quiet', action='store_true', help="don't show verbose url get log in stderr")
    parser.add_argument('-s', '--script', action='store_true', help="keep javascript in the generated html")
    parser.add_argument('-k', '--insecure', action='store_true', help="ignore the certificate")
    parser.add_argument('-o', '--output', help="save output to")
    parser.add_argument('-u', '--username', help="use HTTP basic auth with specified username")
    parser.add_argument('-p', '--password', help="use HTTP basic auth with specified password")
    parser.add_argument('--errorpage', action='store_true', help="crawl an error page")
    parser.add_argument("url", help="the website to store")
    args = parser.parse_args()

    # map CLI flag names onto generate()'s keyword parameters; the extra
    # namespace keys (quiet, script, ...) are absorbed by generate's **kwargs
    args.verbose = not args.quiet
    args.keep_script = args.script
    args.verify = not args.insecure
    args.index = args.url
    kwargs = vars(args)

    rs = generate(**kwargs)
    if args.output and args.output != '-':
        # write bytes so the file is always utf-8 regardless of locale
        with open(args.output, 'wb') as f:
            f.write(rs.encode())
    else:
        # rebind stdout with an explicit utf-8 writer before emitting the page
        sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
        sys.stdout.write(rs)


if __name__ == '__main__':
    main()
|