Spaces:

Awell00
/

book_summarizer

Paused

App Files Files Community

book_summarizer / epubsplit.py

Awell00

feat!: add all files

feb2065 verified about 1 year ago

raw

history blame contribute delete

53 kB

	#!/usr/bin/env python
	# -- coding: utf-8 --

	__license__ = 'GPL v3'
	__copyright__ = '2021, Jim Miller'
	__docformat__ = 'restructuredtext en'

	import sys, re, os, traceback, copy
	from posixpath import normpath
	import logging
	logger = logging.getLogger(__name__)

	from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED

	from xml.dom.minidom import parse, parseString, getDOMImplementation, Element
	from time import time

	import six
	from six.moves.urllib.parse import unquote
	from six import string_types, text_type as unicode
	from six import unichr

	from bs4 import BeautifulSoup

	## font decoding code lifted from
	## calibre/src/calibre/ebooks/conversion/plugins/epub_input.py
	## copyright '2009, Kovid Goyal <kovid@kovidgoyal.net>'
	## don't bug Kovid about this use of it.

	ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC'
	IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'
	from itertools import cycle

	class FontDecrypter:
	def __init__(self, epub, content_dom):
	self.epub = epub
	self.content_dom = content_dom
	self.encryption = {}
	self.old_uuid = None

	def get_file(self,href):
	return self.epub.read(href)

	def get_encrypted_fontfiles(self):
	if not self.encryption:
	## Find the .opf file.
	try:
	# <encryption xmlns="urn:oasis:names:tc:opendocument:xmlns:container"
	# xmlns:enc="http://www.w3.org/2001/04/xmlenc#"
	# xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">
	# <enc:EncryptedData>
	# <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>
	# <enc:CipherData>
	# <enc:CipherReference URI="fonts/00017.ttf"/>
	# </enc:CipherData>
	# </enc:EncryptedData>
	# </encryption>
	encryption = self.epub.read("META-INF/encryption.xml")
	encryptiondom = parseString(encryption)
	# print(encryptiondom.toprettyxml(indent=' '))
	for encdata in encryptiondom.getElementsByTagName('enc:EncryptedData'):
	# print(encdata.toprettyxml(indent=' '))
	algorithm = encdata.getElementsByTagName('enc:EncryptionMethod')[0].getAttribute('Algorithm')
	if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
	print("Unknown font encryption: %s"%algorithm)
	else:
	# print(algorithm)
	for encref in encdata.getElementsByTagName('enc:CipherReference'):
	# print(encref.getAttribute('URI'))
	self.encryption[encref.getAttribute('URI')]=algorithm
	except KeyError as ke:
	self.encryption = {}
	return self.encryption

	def get_old_uuid(self):
	if not self.old_uuid:
	contentdom = self.content_dom
	uidkey = contentdom.getElementsByTagName("package")[0].getAttribute("unique-identifier")
	for dcid in contentdom.getElementsByTagName("dc:identifier"):
	if dcid.getAttribute("id") == uidkey and dcid.getAttribute("opf:scheme") == "uuid":
	self.old_uuid = dcid.firstChild.data
	return self.old_uuid

	def get_idpf_key(self):
	# idpf key:urn:uuid:221c69fe-29f3-4cb4-bb3f-58c430261cc6
	# idpf key:b'\xfb\xa9\x03N}\xae~\x12 \xaa\xe0\xc11\xe2\xe7\x1b\xf6\xa5\xcas'
	idpf_key = self.get_old_uuid()
	import uuid, hashlib
	idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
	idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
	return idpf_key

	def get_adobe_key(self):
	# adobe key:221c69fe-29f3-4cb4-bb3f-58c430261cc6
	# adobe key:b'"\x1ci\xfe)\xf3L\xb4\xbb?X\xc40&\x1c\xc6'
	adobe_key = self.get_old_uuid()
	import uuid
	adobe_key = adobe_key.rpartition(':')[-1] # skip urn:uuid:
	adobe_key = uuid.UUID(adobe_key).bytes
	return adobe_key

	def get_decrypted_font_data(self, uri):
	# print(self.get_old_uuid())
	# print("idpf : %s"%self.get_idpf_key())
	# print("adobe: %s"%self.get_adobe_key())
	# print("uri:%s"%uri)
	font_data = self.get_file(uri)
	if uri in self.get_encrypted_fontfiles():
	key = self.get_adobe_key() if self.get_encrypted_fontfiles()[uri] == ADOBE_OBFUSCATION else self.get_idpf_key()
	font_data = self.decrypt_font_data(key, font_data, self.get_encrypted_fontfiles()[uri])
	return font_data

	def decrypt_font_data(self, key, data, algorithm):
	is_adobe = algorithm == ADOBE_OBFUSCATION
	crypt_len = 1024 if is_adobe else 1040
	crypt = bytearray(data[:crypt_len])
	key = cycle(iter(bytearray(key)))
	decrypt = bytes(bytearray(x^next(key) for x in crypt))
	return decrypt + data[crypt_len:]

	def _unirepl(match):
	"Return the unicode string for a decimal number"
	if match.group(1).startswith('x'):
	radix=16
	s = match.group(1)[1:]
	else:
	radix=10
	s = match.group(1)
	try:
	value = int(s, radix)
	retval = "%s%s"%(unichr(value),match.group(2))
	except:
	# This way, at least if there's more of entities out there
	# that fail, it doesn't blow the entire download.
	print("Numeric entity translation failed, skipping: &#x%s%s"%(match.group(1),match.group(2)))
	retval = ""
	return retval

	def _replaceNumberEntities(data):
	# The same brokenish entity parsing in SGMLParser that inserts ';'
	# after non-entities will also insert ';' incorrectly after number
	# entities, including part of the next word if it's a-z.
	# "Don't&#8212ever&#8212do&#8212that&#8212again," becomes
	# "Don't&#8212e;ver&#8212d;o—that&#8212a;gain,"
	# Also need to allow for 5 digit decimal entities 法
	# Last expression didn't allow for 2 digit hex correctly: é
	p = re.compile(r'&#(x[0-9a-fA-F]{,4}\|[0-9]{,5})([0-9a-fA-F]*?);')
	return p.sub(_unirepl, data)

	def _replaceNotEntities(data):
	# not just \w or \S. regexp from c:\Python25\lib\sgmllib.py
	# (or equiv), SGMLParser, entityref
	p = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
	return p.sub(r'&\1', data)

	def stripHTML(soup):
	return removeAllEntities(re.sub(r'<[^>]+>','',"%s" % soup)).strip()

	def conditionalRemoveEntities(value):
	if isinstance(value,string_types) :
	return removeEntities(value).strip()
	else:
	return value

	def removeAllEntities(text):
	# Remove < < and &
	return removeEntities(text).replace('<', '<').replace('>', '>').replace('&', '&')

	def removeEntities(text):

	if text is None:
	return ""
	if not (isinstance(text,string_types)):
	return str(text)

	try:
	t = unicode(text) #.decode('utf-8')
	except UnicodeEncodeError as e:
	try:
	t = text.encode ('ascii', 'xmlcharrefreplace')
	except UnicodeEncodeError as e:
	t = text
	text = t
	# replace numeric versions of [&<>] with named versions,
	# then replace named versions with actual characters,
	text = re.sub(r'&#0*38;','&',text)
	text = re.sub(r'&#0*60;','<',text)
	text = re.sub(r'&#0*62;','>',text)

	# replace remaining entities with unicode value, such as ' -> '
	text = _replaceNumberEntities(text)

	# replace several named entities with character, such as — -> -
	# see constants.py for the list.
	# reverse sort will put entities with ; before the same one without, when valid.
	for e in reversed(sorted(entities.keys())):
	v = entities[e]
	try:
	text = text.replace(e, v)
	except UnicodeDecodeError as ex:
	# for the pound symbol in constants.py
	text = text.replace(e, v.decode('utf-8'))

	# SGMLParser, and in turn, BeautifulStoneSoup doesn't parse
	# entities terribly well and inserts (;) after something that
	# it thinks might be an entity. AT&T becomes AT&T; All of my
	# attempts to fix this by changing the input to
	# BeautifulStoneSoup break something else instead. But at
	# this point, there should be no real entities left, so find
	# these not-entities and removing them here should be safe.
	text = _replaceNotEntities(text)

	# < < and & are the only html entities allowed in xhtml, put those back.
	return text.replace('&', '&').replace('&lt', '<').replace('&gt', '>')

	# entity list from http://code.google.com/p/doctype/wiki/CharacterEntitiesConsistent
	entities = { 'á' : 'á',
	'Á' : 'Á',
	'&Aacute' : 'Á',
	'&aacute' : 'á',
	'â' : 'â',
	'Â' : 'Â',
	'&Acirc' : 'Â',
	'&acirc' : 'â',
	'´' : '´',
	'&acute' : '´',
	'Æ' : 'Æ',
	'æ' : 'æ',
	'&AElig' : 'Æ',
	'&aelig' : 'æ',
	'à' : 'à',
	'À' : 'À',
	'&Agrave' : 'À',
	'&agrave' : 'à',
	'&alefsym;' : 'ℵ',
	'α' : 'α',
	'Α' : 'Α',
	'&' : '&',
	'&AMP;' : '&',
	'&AMP' : '&',
	'&amp' : '&',
	'&and;' : '∧',
	'&ang;' : '∠',
	'å' : 'å',
	'Å' : 'Å',
	'&Aring' : 'Å',
	'&aring' : 'å',
	'≈' : '≈',
	'ã' : 'ã',
	'Ã' : 'Ã',
	'&Atilde' : 'Ã',
	'&atilde' : 'ã',
	'ä' : 'ä',
	'Ä' : 'Ä',
	'&Auml' : 'Ä',
	'&auml' : 'ä',
	'&bdquo;' : '„',
	'β' : 'β',
	'Β' : 'Β',
	'¦' : '¦',
	'&brvbar' : '¦',
	'•' : '•',
	'∩' : '∩',
	'ç' : 'ç',
	'Ç' : 'Ç',
	'&Ccedil' : 'Ç',
	'&ccedil' : 'ç',
	'¸' : '¸',
	'&cedil' : '¸',
	'¢' : '¢',
	'&cent' : '¢',
	'χ' : 'χ',
	'Χ' : 'Χ',
	'&circ;' : 'ˆ',
	'&clubs;' : '♣',
	'&cong;' : '≅',
	'©' : '©',
	'&COPY;' : '©',
	'&COPY' : '©',
	'&copy' : '©',
	'&crarr;' : '↵',
	'∪' : '∪',
	'¤' : '¤',
	'&curren' : '¤',
	'&dagger;' : '†',
	'&Dagger;' : '‡',
	'↓' : '↓',
	'&dArr;' : '⇓',
	'°' : '°',
	'&deg' : '°',
	'δ' : 'δ',
	'Δ' : 'Δ',
	'&diams;' : '♦',
	'÷' : '÷',
	'&divide' : '÷',
	'é' : 'é',
	'É' : 'É',
	'&Eacute' : 'É',
	'&eacute' : 'é',
	'ê' : 'ê',
	'Ê' : 'Ê',
	'&Ecirc' : 'Ê',
	'&ecirc' : 'ê',
	'è' : 'è',
	'È' : 'È',
	'&Egrave' : 'È',
	'&egrave' : 'è',
	'∅' : '∅',
	'&emsp;' : ' ',
	'&ensp;' : ' ',
	'ε' : 'ε',
	'Ε' : 'Ε',
	'&equiv;' : '≡',
	'η' : 'η',
	'Η' : 'Η',
	'ð' : 'ð',
	'Ð' : 'Ð',
	'&ETH' : 'Ð',
	'&eth' : 'ð',
	'ë' : 'ë',
	'Ë' : 'Ë',
	'&Euml' : 'Ë',
	'&euml' : 'ë',
	'€' : '€',
	'∃' : '∃',
	'&fnof;' : 'ƒ',
	'∀' : '∀',
	'½' : '½',
	'&frac12' : '½',
	'¼' : '¼',
	'&frac14' : '¼',
	'¾' : '¾',
	'&frac34' : '¾',
	'&frasl;' : '⁄',
	'γ' : 'γ',
	'Γ' : 'Γ',
	'≥' : '≥',
	#'>' : '>',
	#'&GT;' : '>',
	#'&GT' : '>',
	#'&gt' : '>',
	'↔' : '↔',
	'⇔' : '⇔',
	'&hearts;' : '♥',
	'…' : '…',
	'í' : 'í',
	'Í' : 'Í',
	'&Iacute' : 'Í',
	'&iacute' : 'í',
	'î' : 'î',
	'Î' : 'Î',
	'&Icirc' : 'Î',
	'&icirc' : 'î',
	'¡' : '¡',
	'&iexcl' : '¡',
	'ì' : 'ì',
	'Ì' : 'Ì',
	'&Igrave' : 'Ì',
	'&igrave' : 'ì',
	'&image;' : 'ℑ',
	'∞' : '∞',
	'∫' : '∫',
	'ι' : 'ι',
	'Ι' : 'Ι',
	'¿' : '¿',
	'&iquest' : '¿',
	'∈' : '∈',
	'ï' : 'ï',
	'Ï' : 'Ï',
	'&Iuml' : 'Ï',
	'&iuml' : 'ï',
	'κ' : 'κ',
	'Κ' : 'Κ',
	'λ' : 'λ',
	'Λ' : 'Λ',
	'«' : '«',
	'&laquo' : '«',
	'←' : '←',
	'⇐' : '⇐',
	'&lceil;' : '⌈',
	'“' : '“',
	'≤' : '≤',
	'&lfloor;' : '⌊',
	'&lowast;' : '∗',
	'&loz;' : '◊',
	'&lrm;' : '‎',
	'&lsaquo;' : '‹',
	'‘' : '‘',
	#'<' : '<',
	#'&LT;' : '<',
	#'&LT' : '<',
	#'&lt' : '<',
	'¯' : '¯',
	'&macr' : '¯',
	'—' : '—',
	'µ' : 'µ',
	'&micro' : 'µ',
	'·' : '·',
	'&middot' : '·',
	'−' : '−',
	'μ' : 'μ',
	'Μ' : 'Μ',
	'∇' : '∇',
	' ' : ' ',
	'&nbsp' : ' ',
	'–' : '–',
	'≠' : '≠',
	'&ni;' : '∋',
	'¬' : '¬',
	'&not' : '¬',
	'∉' : '∉',
	'&nsub;' : '⊄',
	'ñ' : 'ñ',
	'Ñ' : 'Ñ',
	'&Ntilde' : 'Ñ',
	'&ntilde' : 'ñ',
	'ν' : 'ν',
	'Ν' : 'Ν',
	'ó' : 'ó',
	'Ó' : 'Ó',
	'&Oacute' : 'Ó',
	'&oacute' : 'ó',
	'ô' : 'ô',
	'Ô' : 'Ô',
	'&Ocirc' : 'Ô',
	'&ocirc' : 'ô',
	'&OElig;' : 'Œ',
	'&oelig;' : 'œ',
	'ò' : 'ò',
	'Ò' : 'Ò',
	'&Ograve' : 'Ò',
	'&ograve' : 'ò',
	'&oline;' : '‾',
	'ω' : 'ω',
	'Ω' : 'Ω',
	'ο' : 'ο',
	'Ο' : 'Ο',
	'&oplus;' : '⊕',
	'&or;' : '∨',
	'ª' : 'ª',
	'&ordf' : 'ª',
	'º' : 'º',
	'&ordm' : 'º',
	'ø' : 'ø',
	'Ø' : 'Ø',
	'&Oslash' : 'Ø',
	'&oslash' : 'ø',
	'õ' : 'õ',
	'Õ' : 'Õ',
	'&Otilde' : 'Õ',
	'&otilde' : 'õ',
	'&otimes;' : '⊗',
	'ö' : 'ö',
	'Ö' : 'Ö',
	'&Ouml' : 'Ö',
	'&ouml' : 'ö',
	'¶' : '¶',
	'&para' : '¶',
	'∂' : '∂',
	'&permil;' : '‰',
	'&perp;' : '⊥',
	'φ' : 'φ',
	'Φ' : 'Φ',
	'π' : 'π',
	'Π' : 'Π',
	'ϖ' : 'ϖ',
	'±' : '±',
	'&plusmn' : '±',
	'£' : '£',
	'&pound' : '£',
	'′' : '′',
	'″' : '″',
	'∏' : '∏',
	'&prop;' : '∝',
	'ψ' : 'ψ',
	'Ψ' : 'Ψ',
	'"' : '"',
	'&QUOT;' : '"',
	'&QUOT' : '"',
	'&quot' : '"',
	'√' : '√',
	'»' : '»',
	'&raquo' : '»',
	'→' : '→',
	'⇒' : '⇒',
	'&rceil;' : '⌉',
	'”' : '”',
	'&real;' : 'ℜ',
	'®' : '®',
	'&REG;' : '®',
	'&REG' : '®',
	'&reg' : '®',
	'&rfloor;' : '⌋',
	'ρ' : 'ρ',
	'Ρ' : 'Ρ',
	'&rlm;' : '‏',
	'&rsaquo;' : '›',
	'’' : '’',
	'&sbquo;' : '‚',
	'&scaron;' : 'š',
	'&Scaron;' : 'Š',
	'⋅' : '⋅',
	'§' : '§',
	'&sect' : '§',
	'' : '', # strange optional hyphenation control character, not just a dash
	'&shy' : '',
	'σ' : 'σ',
	'Σ' : 'Σ',
	'&sigmaf;' : 'ς',
	'&sim;' : '∼',
	'&spades;' : '♠',
	'⊂' : '⊂',
	'&sube;' : '⊆',
	'∑' : '∑',
	'¹' : '¹',
	'&sup1' : '¹',
	'²' : '²',
	'&sup2' : '²',
	'³' : '³',
	'&sup3' : '³',
	'⊃' : '⊃',
	'&supe;' : '⊇',
	'ß' : 'ß',
	'&szlig' : 'ß',
	'τ' : 'τ',
	'Τ' : 'Τ',
	'&there4;' : '∴',
	'θ' : 'θ',
	'Θ' : 'Θ',
	'&thetasym;' : 'ϑ',
	' ' : ' ',
	'þ' : 'þ',
	'Þ' : 'Þ',
	'&THORN' : 'Þ',
	'&thorn' : 'þ',
	'&tilde;' : '˜',
	'×' : '×',
	'&times' : '×',
	'™' : '™',
	'ú' : 'ú',
	'Ú' : 'Ú',
	'&Uacute' : 'Ú',
	'&uacute' : 'ú',
	'↑' : '↑',
	'&uArr;' : '⇑',
	'û' : 'û',
	'Û' : 'Û',
	'&Ucirc' : 'Û',
	'&ucirc' : 'û',
	'ù' : 'ù',
	'Ù' : 'Ù',
	'&Ugrave' : 'Ù',
	'&ugrave' : 'ù',
	'¨' : '¨',
	'&uml' : '¨',
	'&upsih;' : 'ϒ',
	'υ' : 'υ',
	'Υ' : 'Υ',
	'ü' : 'ü',
	'Ü' : 'Ü',
	'&Uuml' : 'Ü',
	'&uuml' : 'ü',
	'&weierp;' : '℘',
	'ξ' : 'ξ',
	'Ξ' : 'Ξ',
	'ý' : 'ý',
	'Ý' : 'Ý',
	'&Yacute' : 'Ý',
	'&yacute' : 'ý',
	'¥' : '¥',
	'&yen' : '¥',
	'ÿ' : 'ÿ',
	'&Yuml;' : 'Ÿ',
	'&yuml' : 'ÿ',
	'ζ' : 'ζ',
	'Ζ' : 'Ζ',
	'&zwj;' : '‍', # strange spacing control character, not just a space
	'&zwnj;' : '‌', # strange spacing control character, not just a space
	}

	class SplitEpub:

	def __init__(self, inputio):
	self.epub = ZipFile(inputio, 'r')
	self.content_dom = None
	self.content_relpath = None
	self.manifest_items = None
	self.guide_items = None
	self.toc_dom = None
	self.toc_relpath = None
	self.toc_map = None
	self.split_lines = None
	self.origauthors = []
	self.origtitle = None

	def get_file(self,href):
	return self.epub.read(href)

	def get_content_dom(self):
	if not self.content_dom:
	## Find the .opf file.
	container = self.epub.read("META-INF/container.xml")
	containerdom = parseString(container)
	rootfilenodelist = containerdom.getElementsByTagName("rootfile")
	rootfilename = rootfilenodelist[0].getAttribute("full-path")

	self.content_dom = parseString(self.epub.read(rootfilename))
	self.content_relpath = get_path_part(rootfilename)
	return self.content_dom

	def get_content_relpath(self):
	## Save the path to the .opf file--hrefs inside it are relative to it.
	if not self.content_relpath:
	self.get_content_dom() # sets self.content_relpath also.
	return self.content_relpath

	def get_toc_relpath(self):
	## Save the path to the toc.ncx file--hrefs inside it are relative to it.
	if not self.toc_relpath:
	self.get_manifest_items() # sets self.toc_relpath also.
	return self.toc_relpath

	def get_manifest_items(self):
	if not self.manifest_items:
	self.manifest_items = {}

	for item in self.get_content_dom().getElementsByTagName("item"):
	fullhref=normpath(unquote(self.get_content_relpath()+item.getAttribute("href")))
	#print("---- item fullhref:%s"%(fullhref))
	self.manifest_items["h:"+fullhref]=(item.getAttribute("id"),item.getAttribute("media-type"))
	self.manifest_items["i:"+item.getAttribute("id")]=(fullhref,item.getAttribute("media-type"))

	if( item.getAttribute("media-type") == "application/x-dtbncx+xml" ):
	# TOC file is only one with this type--as far as I know.
	self.toc_relpath = get_path_part(fullhref)
	self.toc_dom = parseString(self.epub.read(fullhref))

	return self.manifest_items

	def get_guide_items(self):
	if not self.guide_items:
	self.guide_items = {}

	for item in self.get_content_dom().getElementsByTagName("reference"):
	fullhref=normpath(unquote(self.get_content_relpath()+item.getAttribute("href")))
	self.guide_items[fullhref]=(item.getAttribute("type"),item.getAttribute("title"))
	#print("---- reference href:%s value:%s"%(fullhref,self.guide_items[fullhref],))
	#self.guide_items[item.getAttribute("type")]=(fullhref,item.getAttribute("media-type"))

	return self.guide_items

	def get_toc_dom(self):
	if not self.toc_dom:
	self.get_manifest_items() # also sets self.toc_dom
	return self.toc_dom

	# dict() of href->[(text,anchor),...],...
	# eg: "file0001.html"->[("Introduction","anchor01"),("Chapter 1","anchor02")],...
	def get_toc_map(self):
	if not self.toc_map:
	self.toc_map = {}
	# update all navpoint ids with bookid for uniqueness.
	for navpoint in self.get_toc_dom().getElementsByTagName("navPoint"):
	src = normpath(unquote(self.get_toc_relpath()+navpoint.getElementsByTagName("content")[0].getAttribute("src")))
	if '#' in src:
	(href,anchor)=src.split("#")
	else:
	(href,anchor)=(src,None)

	# The first of these in each navPoint should be the appropriate one.
	# (may be others due to nesting.
	try:
	text = unicode(navpoint.getElementsByTagName("text")[0].firstChild.data)
	except:
	#print("No chapter title found in TOC for (%s)"%src)
	text = ""

	if href not in self.toc_map:
	self.toc_map[href] = []
	if anchor == None:
	# put file links ahead of ancher links. Otherwise
	# a non-linear anchor link may take precedence,
	# which will confuse EpubSplit. This will cause
	# split lines to possibly be out of order from
	# TOC, but the alternative is worse. Should be a
	# rare corner case.
	## Keep order of non-anchor entries to the same file.
	idx=0
	while idx < len(self.toc_map[href]) and self.toc_map[href][idx][1] is None: # [1] is anchor
	# print(idx)
	# print(self.toc_map[href][idx])
	idx = idx+1
	self.toc_map[href].insert(idx,(text,anchor))
	else:
	self.toc_map[href].append((text,anchor))
	# print(self.toc_map)
	return self.toc_map

	# list of dicts with href, anchor & toc text.
	# 'split lines' are all the points that the epub can be split on.
	# Offer a split at each spine file and each ToC point.
	def get_split_lines(self):

	metadom = self.get_content_dom()
	## Save indiv book title
	try:
	self.origtitle = metadom.getElementsByTagName("dc:title")[0].firstChild.data
	except:
	self.origtitle = "(Title Missing)"

	## Save authors.
	for creator in metadom.getElementsByTagName("dc:creator"):
	try:
	if( creator.getAttribute("opf:role") == "aut" or not creator.hasAttribute("opf:role") and creator.firstChild != None):
	if creator.firstChild.data not in self.origauthors:
	self.origauthors.append(creator.firstChild.data)
	except:
	pass
	if len(self.origauthors) == 0:
	self.origauthors.append("(Authors Missing)")

	self.split_lines = [] # list of dicts with href, anchor and toc
	# spin on spine files.
	count=0
	for itemref in metadom.getElementsByTagName("itemref"):
	idref = itemref.getAttribute("idref")
	(href,type) = self.get_manifest_items()["i:"+idref]
	current = {}
	self.split_lines.append(current)
	current['href']=href
	current['anchor']=None
	current['toc'] = []
	if href in self.get_guide_items():
	current['guide'] = self.get_guide_items()[href]
	current['id'] = idref
	current['type'] = type
	current['num'] = count
	t=self.epub.read(href).decode('utf-8')
	if len(t) > 1500 : t = t[:1500] + "..."
	current['sample']=t
	count += 1
	#print("spine:%s->%s"%(idref,href))

	# if href is in the toc.
	if href in self.get_toc_map():
	# For each toc entry, check to see if there's an anchor, if so,
	# make a new split line.
	for tocitem in self.get_toc_map()[href]:
	(text,anchor) = tocitem
	# XXX for outputing to screen in CLI--hopefully won't need in plugin?
	try:
	text = "%s"%text
	except:
	text = "(error text)"

	if anchor:
	#print("breakpoint: %d"%count)
	current = {}
	self.split_lines.append(current)
	current['href']=href
	current['anchor']=anchor
	current['toc']=[]
	current['id'] = idref
	current['type'] = type
	current['num'] = count
	# anchor, need to split first, then reduce to 1500.
	t=splitHtml(self.epub.read(href).decode('utf-8'),anchor,before=False)
	if len(t) > 1500 : t = t[:1500] + "..."
	current['sample']=t
	count += 1
	# There can be more than one toc to the same split line.
	# This won't find multiple toc to the same anchor yet.
	current['toc'].append(text)
	#print("\ttoc:'%s' %s#%s"%(text,href,anchor))
	return self.split_lines

	# pass in list of line numbers(?)
	def get_split_files(self,linenums):

	self.filecache = FileCache(self.get_manifest_items())

	# set include flag in split_lines.
	if not self.split_lines:
	self.get_split_lines()
	lines = self.split_lines

	lines_set = set([int(k) for k in linenums])
	for j in range(len(lines)):
	lines[j]['include'] = j in lines_set

	# loop through finding 'chunks' -- contiguous pieces in the
	# same file. Each included file is at least one chunk, but if
	# parts are left out, one original file can end up being more
	# than one chunk.
	outchunks = [] # list of tuples=(filename,start,end) 'end' is not inclusive.
	inchunk = False
	currentfile = None
	start = None
	for line in lines:
	if line['include']:
	if not inchunk: # start new chunk
	inchunk = True
	currentfile = line['href']
	start = line
	else: # inchunk
	# different file, new chunk.
	if currentfile != line['href']:
	outchunks.append((currentfile,start,line))
	inchunk=True
	currentfile=line['href']
	start=line
	else: # not include
	if inchunk: # save previous chunk.
	outchunks.append((currentfile,start,line))
	inchunk=False

	# final chunk for when last in list is include.
	if inchunk:
	outchunks.append((currentfile,start,None))

	outfiles=[] # tuples, (filename,type,data) -- filename changed to unique
	for (href,start,end) in outchunks:
	filedata = self.epub.read(href).decode('utf-8')

	# discard before start if anchor.
	if start['anchor'] != None:
	filedata = splitHtml(filedata,start['anchor'],before=False)

	# discard from end anchor on(inclusive), but only if same file. If
	# different file, keep rest of file. If no 'end', then it was the
	# last chunk and went to the end of the last file.
	if end != None and end['anchor'] != None and end['href']==href:
	filedata = splitHtml(filedata,end['anchor'],before=True)

	filename = self.filecache.add_content_file(href,filedata)
	outfiles.append([filename,start['id'],start['type'],filedata])

	# print("self.oldnew:%s"%self.filecache.oldnew)
	# print("self.newold:%s"%self.filecache.newold)
	# print("\nanchors:%s\n"%self.filecache.anchors)
	# print("\nlinkedfiles:%s\n"%self.filecache.linkedfiles)
	# print("relpath:%s"%get_path_part())

	# Spin through to replace internal URLs
	for fl in outfiles:
	#print("file:%s"%fl[0])
	soup = BeautifulSoup(fl[3],'html5lib')
	changed = False
	for a in soup.findAll('a'):
	if a.has_attr('href'):
	path = normpath(unquote("%s%s"%(get_path_part(fl[0]),a['href'])))
	#print("full a['href']:%s"%path)
	if path in self.filecache.anchors and self.filecache.anchors[path] != path:
	a['href'] = self.filecache.anchors[path][len(get_path_part(fl[0])):]
	#print("replacement path:%s"%a['href'])
	changed = True
	if changed:
	fl[3] = unicode(soup)

	return outfiles

	def write_split_epub(self,
	outputio,
	linenums,
	changedtocs={},
	authoropts=[],
	titleopt=None,
	descopt=None,
	tags=[],
	languages=['en'],
	coverjpgpath=None):

	files = self.get_split_files(linenums)

	## Write mimetype file, must be first and uncompressed.
	## Older versions of python(2.4/5) don't allow you to specify
	## compression by individual file.
	## Overwrite if existing output file.
	outputepub = ZipFile(outputio, "w", compression=ZIP_STORED)
	outputepub.debug = 3
	outputepub.writestr("mimetype", "application/epub+zip")
	outputepub.close()

	## Re-open file for content.
	outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED)
	outputepub.debug = 3

	## Create META-INF/container.xml file. The only thing it does is
	## point to content.opf
	containerdom = getDOMImplementation().createDocument(None, "container", None)
	containertop = containerdom.documentElement
	containertop.setAttribute("version","1.0")
	containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
	rootfiles = containerdom.createElement("rootfiles")
	containertop.appendChild(rootfiles)
	rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf",
	"media-type":"application/oebps-package+xml"}))
	outputepub.writestr("META-INF/container.xml",containerdom.toprettyxml(indent=' ',encoding='utf-8'))


	#### ## create content.opf file.
	uniqueid="epubsplit-uid-%d" % time() # real sophisticated uid scheme.
	contentdom = getDOMImplementation().createDocument(None, "package", None)
	package = contentdom.documentElement

	package.setAttribute("version","2.0")
	package.setAttribute("xmlns","http://www.idpf.org/2007/opf")
	package.setAttribute("unique-identifier","epubsplit-id")
	metadata=newTag(contentdom,"metadata",
	attrs={"xmlns:dc":"http://purl.org/dc/elements/1.1/",
	"xmlns:opf":"http://www.idpf.org/2007/opf"})
	metadata.appendChild(newTag(contentdom,"dc:identifier",text=uniqueid,attrs={"id":"epubsplit-id"}))
	if( titleopt is None ):
	titleopt = self.origtitle+" Split"
	metadata.appendChild(newTag(contentdom,"dc:title",text=titleopt))

	if( authoropts and len(authoropts) > 0 ):
	useauthors=authoropts
	else:
	useauthors=self.origauthors

	usedauthors=dict()
	for author in useauthors:
	if( author not in usedauthors ):
	usedauthors[author]=author
	metadata.appendChild(newTag(contentdom,"dc:creator",
	attrs={"opf:role":"aut"},
	text=author))

	metadata.appendChild(newTag(contentdom,"dc:contributor",text="epubsplit",attrs={"opf:role":"bkp"}))
	metadata.appendChild(newTag(contentdom,"dc:rights",text="Copyrights as per source stories"))

	if languages:
	for l in languages:
	metadata.appendChild(newTag(contentdom,"dc:language",text=l))
	else:
	metadata.appendChild(newTag(contentdom,"dc:language",text="en"))

	if not descopt:
	# created now, but not filled in until TOC generation to save loops.
	description = newTag(contentdom,"dc:description",text="Split from %s by %s."%(self.origtitle,", ".join(self.origauthors)))
	else:
	description = newTag(contentdom,"dc:description",text=descopt)
	metadata.appendChild(description)

	for tag in tags:
	metadata.appendChild(newTag(contentdom,"dc:subject",text=tag))

	package.appendChild(metadata)

	manifest = contentdom.createElement("manifest")
	package.appendChild(manifest)
	spine = newTag(contentdom,"spine",attrs={"toc":"ncx"})
	package.appendChild(spine)

	manifest.appendChild(newTag(contentdom,"item",
	attrs={'id':'ncx',
	'href':'toc.ncx',
	'media-type':'application/x-dtbncx+xml'}))

	if coverjpgpath:
	# <meta name="cover" content="cover.jpg"/>
	metadata.appendChild(newTag(contentdom,"meta",{"name":"cover",
	"content":"coverimageid"}))
	# cover stuff for later:
	# at end of <package>:
	# <guide>
	# <reference type="cover" title="Cover" href="Text/cover.xhtml"/>
	# </guide>
	guide = newTag(contentdom,"guide")
	guide.appendChild(newTag(contentdom,"reference",attrs={"type":"cover",
	"title":"Cover",
	"href":"cover.xhtml"}))
	package.appendChild(guide)

	manifest.appendChild(newTag(contentdom,"item",
	attrs={'id':"coverimageid",
	'href':"cover.jpg",
	'media-type':"image/jpeg"}))

	# Note that the id of the cover xhmtl must be 'cover'
	# for it to work on Nook.
	manifest.appendChild(newTag(contentdom,"item",
	attrs={'id':"cover",
	'href':"cover.xhtml",
	'media-type':"application/xhtml+xml"}))

	spine.appendChild(newTag(contentdom,"itemref",
	attrs={"idref":"cover",
	"linear":"yes"}))

	contentcount=0
	for (filename,id,type,filedata) in files:
	#filename = self.filecache.addHtml(href,filedata)
	#print("writing :%s"%filename)
	# add to manifest and spine

	if coverjpgpath and filename == "cover.xhtml":
	continue # don't dup cover.

	outputepub.writestr(filename,filedata.encode('utf-8'))
	id = "a%d"%contentcount
	contentcount += 1
	manifest.appendChild(newTag(contentdom,"item",
	attrs={'id':id,
	'href':filename,
	'media-type':type}))
	spine.appendChild(newTag(contentdom,"itemref",
	attrs={"idref":id,
	"linear":"yes"}))

	fontdecrypter = FontDecrypter(self.epub,self.get_content_dom())
	linked=''
	for (linked,type) in self.filecache.linkedfiles:
	# print("linked files:(%s,%s)"%(linked,type))
	# add to manifest
	if coverjpgpath and linked == "cover.jpg":
	continue # don't dup cover.

	try:
	linkeddata = self.get_file(linked)
	if linked in fontdecrypter.get_encrypted_fontfiles():
	print("Decrypting font file: %s"%linked)
	linkeddata = fontdecrypter.get_decrypted_font_data(linked)
	outputepub.writestr(linked,linkeddata)

	except Exception as e:
	print("Skipping linked file (%s)\nException: %s"%(linked,e))

	id = "a%d"%contentcount
	contentcount += 1
	manifest.appendChild(newTag(contentdom,"item",
	attrs={'id':id,
	'href':linked,
	'media-type':type}))

	contentxml = contentdom.toprettyxml(indent=' ') # ,encoding='utf-8'
	# tweak for brain damaged Nook STR. Nook insists on name before content.
	contentxml = contentxml.replace('<meta content="coverimageid" name="cover"/>',
	'<meta name="cover" content="coverimageid"/>')
	outputepub.writestr("content.opf",contentxml)

	## create toc.ncx file
	tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
	ncx = tocncxdom.documentElement
	ncx.setAttribute("version","2005-1")
	ncx.setAttribute("xmlns","http://www.daisy.org/z3986/2005/ncx/")
	head = tocncxdom.createElement("head")
	ncx.appendChild(head)
	head.appendChild(newTag(tocncxdom,"meta",
	attrs={"name":"dtb:uid", "content":uniqueid}))
	depthnode = newTag(tocncxdom,"meta",
	attrs={"name":"dtb:depth", "content":"1"})
	head.appendChild(depthnode)
	head.appendChild(newTag(tocncxdom,"meta",
	attrs={"name":"dtb:totalPageCount", "content":"0"}))
	head.appendChild(newTag(tocncxdom,"meta",
	attrs={"name":"dtb:maxPageNumber", "content":"0"}))

	docTitle = tocncxdom.createElement("docTitle")
	docTitle.appendChild(newTag(tocncxdom,"text",text=stripHTML(titleopt)))
	ncx.appendChild(docTitle)

	tocnavMap = tocncxdom.createElement("navMap")
	ncx.appendChild(tocnavMap)

	# come back to lines again for TOC because files only has files(gasp-shock!)
	count=1
	for line in self.split_lines:
	if line['include']:
	# if changed, use only changed values.
	if line['num'] in changedtocs:
	line['toc'] = changedtocs[line['num']]
	# can have more than one toc entry.
	for title in line['toc']:
	newnav = newTag(tocncxdom,"navPoint",
	{"id":"a%03d"%count,"playOrder":"%d" % count})
	count += 1
	tocnavMap.appendChild(newnav)
	navlabel = newTag(tocncxdom,"navLabel")
	newnav.appendChild(navlabel)
	# For purposes of TOC titling & desc, use first book author
	navlabel.appendChild(newTag(tocncxdom,"text",text=stripHTML(title)))
	# Find the first 'spine' item's content for the title navpoint.
	# Many epubs have the first chapter as first navpoint, so we can't just
	# copy that anymore.
	if line['anchor'] and line['href']+"#"+line['anchor'] in self.filecache.anchors:
	src = self.filecache.anchors[line['href']+"#"+line['anchor']]
	#print("toc from anchors(%s#%s)(%s)"%(line['href'],line['anchor'],src))
	else:
	#print("toc from href(%s)"%line['href'])
	src = line['href']
	newnav.appendChild(newTag(tocncxdom,"content",
	{"src":src}))

	outputepub.writestr("toc.ncx",tocncxdom.toprettyxml(indent=' ',encoding='utf-8'))

	if coverjpgpath:
	# write, not write string. Pulling from file.
	outputepub.write(coverjpgpath,"cover.jpg")

	outputepub.writestr("cover.xhtml",'''
	<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"><head><title>Cover</title><style type="text/css" title="override_css">
	@page {padding: 0pt; margin:0pt}
	body { text-align: center; padding:0pt; margin: 0pt; }
	div { margin: 0pt; padding: 0pt; }
	</style></head><body><div>
	<img src="cover.jpg" alt="cover"/>
	</div></body></html>
	''')

	# declares all the files created by Windows. otherwise, when
	# it runs in appengine, windows unzips the files as 000 perms.
	for zf in outputepub.filelist:
	zf.create_system = 0
	outputepub.close()

	class FileCache:

	def __init__(self,manifest_items={}):
	self.manifest_items = manifest_items
	self.oldnew = {}
	self.newold = {}
	self.anchors = {}
	self.linkedfiles = set()

	## always include font files for embedded fonts
	for key, value in six.iteritems(self.manifest_items):
	# print("manifest:%s %s"%(key,value))
	if key.startswith('i:') and value[1] in ('application/vnd.ms-opentype',
	'application/x-font-ttf',
	'application/x-font-truetype',
	'application/font-sfnt'):
	self.add_linked_file(value[0])

	def add_linked_file(self, href):
	href = normpath(unquote(href)) # fix %20 & /../
	if ("h:"+href) in self.manifest_items:
	type = self.manifest_items["h:"+href][1]
	else:
	type = 'unknown'
	self.linkedfiles.add((href,type))

	def add_content_file(self, href, filedata):

	changedname = False
	if href not in self.oldnew:
	self.oldnew[href]=[]
	newfile = href
	else:
	changedname = True
	newfile = "%s%d-%s"%(get_path_part(href),
	len(self.oldnew[href]),
	get_file_part(href))

	self.oldnew[href].append(newfile)
	self.newold[newfile]=href
	#print("newfile:%s"%newfile)

	soup = BeautifulSoup(filedata,'html5lib')
	#print("soup head:%s"%soup.find('head'))

	# same name? Don't need to worry about changing links to anchors
	for a in soup.findAll(): # not just 'a', any tag.
	#print("a:%s"%a)
	if a.has_attr('id'):
	self.anchors[href+'#'+a['id']]=newfile+'#'+a['id']

	# <image> from baen epub.
	# <image width="462" height="616" xlink:href="cover.jpeg"/>
	for img in soup.findAll('img') + soup.findAll('image'):
	src = None
	if img.has_attr('src'):
	src=img['src']
	if img.has_attr('xlink:href'):
	src=img['xlink:href']
	if src:
	self.add_linked_file(get_path_part(href)+src)
	else:
	logger.info("img tag without src in file:(%s) tag:(%s)"%(href,img))

	# link href="0.css" type="text/css"
	for style in soup.findAll('link',{'type':'text/css'}):
	#print("link:%s"%style)
	if style.has_attr('href'):
	self.add_linked_file(get_path_part(href)+style['href'])

	return newfile

	def splitHtml(data,tagid,before=False):
	soup = BeautifulSoup(data,'lxml')
	#print("splitHtml.soup head:%s"%soup.find('head'))

	splitpoint = soup.find(id=tagid)

	#print("splitpoint:%s"%splitpoint)

	if splitpoint == None:
	return data

	if before:
	# remove all next siblings.
	for n in splitpoint.findNextSiblings():
	n.extract()

	parent = splitpoint.parent
	while parent and parent.name != 'body':
	for n in parent.findNextSiblings():
	n.extract()
	parent = parent.parent

	splitpoint.extract()
	else:
	# remove all prev siblings.
	for n in splitpoint.findPreviousSiblings():
	n.extract()

	parent = splitpoint.parent
	while parent and parent.name != 'body':
	for n in parent.findPreviousSiblings():
	n.extract()
	parent = parent.parent

	return re.sub(r'( *\r?\n)+','\r\n',unicode(soup))

	def get_path_part(n):
	relpath = os.path.dirname(n)
	if( len(relpath) > 0 ):
	relpath=relpath+"/"
	return relpath

	def get_file_part(n):
	return os.path.basename(n)

	## Utility method for creating new tags.
	def newTag(dom,name,attrs=None,text=None):
	tag = dom.createElement(name)
	if( attrs is not None ):
	for attr in attrs.keys():
	tag.setAttribute(attr,attrs[attr])
	if( text is not None ):
	tag.appendChild(dom.createTextNode(text))
	return tag

	def main(argv,usage=None):

	from optparse import OptionParser

	if not usage:
	# read in args, anything starting with -- will be treated as --<varible>=<value>
	usage = 'usage: python %prog'

	parser = OptionParser(usage+''' [options] <input epub> [line numbers...]

	Giving an epub without line numbers will return a list of line numbers: the
	possible split points in the input file. Calling with line numbers will
	generate an epub with each of the "lines" given included.''')

	parser.add_option("-o", "--output", dest="outputopt", default="split.epub",
	help="Set OUTPUT file, Default: split.epub", metavar="OUTPUT")
	parser.add_option("--output-dir", dest="outputdiropt",
	help="Set OUTPUT directory, Default: presend working directory")
	parser.add_option('--split-by-section',
	action='store_true', dest='split_by_section',
	help='Create a new epub from each of the listed line sections instead of one containing all. Splits all sections if no lines numbers are given. Each split will be named <number>-<output name> and placed in the output-dir. Sections without a Table of Contents entry will be included with the preceding section(s)', )
	parser.add_option("-t", "--title", dest="titleopt", default=None,
	help="Use TITLE as the metadata title. Default: '<original epub title> Split' or ToC entry with --split-by-section", metavar="TITLE")
	parser.add_option("-d", "--description", dest="descopt", default=None,
	help="Use DESC as the metadata description. Default: 'Split from <epub title> by <author>'.", metavar="DESC")
	parser.add_option("-a", "--author",
	action="append", dest="authoropts", default=[],
	help="Use AUTHOR as a metadata author, multiple authors may be given, Default: <All authors from original epub>", metavar="AUTHOR")
	parser.add_option("-g", "--tag",
	action="append", dest="tagopts", default=[],
	help="Include TAG as dc:subject tag, multiple tags may be given, Default: None", metavar="TAG")
	parser.add_option("-l", "--language",
	action="append", dest="languageopts", default=[],
	help="Include LANG as dc:language tag, multiple languages may be given, Default: en", metavar="LANG")
	parser.add_option("-c", "--cover", dest="coveropt", default=None,
	help="Path to a jpg to use as cover image.", metavar="COVER")

	(options, args) = parser.parse_args(argv)

	## Add .epub if not already there.
	if not options.outputopt.lower().endswith(".epub"):
	options.outputopt=options.outputopt+".epub"

	if not options.languageopts:
	options.languageopts = ['en']

	if not args:
	parser.print_help()
	return

	epubO = SplitEpub(args[0])

	lines = epubO.get_split_lines()

	if options.split_by_section:
	if len(args) > 1:
	section_lines = args[1:]
	else:
	section_lines = range(len(lines))

	splitslist = []
	sectionlist = []
	title=None
	for lineno in section_lines:
	toclist = lines[int(lineno)]['toc']
	if sectionlist and not toclist:
	sectionlist.append(lineno)
	else:
	## take title from (first) ToC if available, else titleopt (_ Split internally if None)
	title = (toclist[0] if toclist else options.titleopt)
	print("title: %s"%title)
	sectionlist=[lineno]
	splitslist.append((sectionlist,title))
	if sectionlist:
	splitslist.append((sectionlist,title))
	# print(splitslist)

	filecount = 1
	for sectionlist, title in splitslist:
	outputfile = "%0.4d-%s"%(filecount,options.outputopt)
	if options.outputdiropt:
	outputfile = os.path.join(options.outputdiropt,outputfile)
	print("output file: "+outputfile)
	epubO.write_split_epub(outputfile,
	sectionlist,
	authoropts=options.authoropts,
	titleopt=title,
	descopt=options.descopt,
	tags=options.tagopts,
	languages=options.languageopts,
	coverjpgpath=options.coveropt)
	filecount+=1
	return
	elif len(args) == 1:
	count = 0
	showlist=['toc','guide','anchor','id','href']
	for line in lines:
	print("\nLine Number: %d"%count)
	for s in showlist:
	if s in line and line[s]:
	print("\t%s: %s"%(s,line[s]))
	count += 1
	return

	if len(args) > 1:
	outputfile = options.outputopt
	if options.outputdiropt:
	outputfile = os.path.join(options.outputdiropt,outputfile)
	print("output file: "+outputfile)
	epubO.write_split_epub(outputfile,
	args[1:],
	authoropts=options.authoropts,
	titleopt=options.titleopt,
	descopt=options.descopt,
	tags=options.tagopts,
	languages=options.languageopts,
	coverjpgpath=options.coveropt)

	return

	if __name__ == "__main__":
	main(sys.argv[1:])