Spaces:
Paused
Paused
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| __license__ = 'GPL v3' | |
| __copyright__ = '2021, Jim Miller' | |
| __docformat__ = 'restructuredtext en' | |
| import sys, re, os, traceback, copy | |
| from posixpath import normpath | |
| import logging | |
| logger = logging.getLogger(__name__) | |
| from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED | |
| from xml.dom.minidom import parse, parseString, getDOMImplementation, Element | |
| from time import time | |
| import six | |
| from six.moves.urllib.parse import unquote | |
| from six import string_types, text_type as unicode | |
| from six import unichr | |
| from bs4 import BeautifulSoup | |
| ## font decoding code lifted from | |
| ## calibre/src/calibre/ebooks/conversion/plugins/epub_input.py | |
| ## copyright '2009, Kovid Goyal <kovid@kovidgoyal.net>' | |
| ## don't bug Kovid about this use of it. | |
| ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC' | |
| IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding' | |
| from itertools import cycle | |
class FontDecrypter:
    """De-obfuscate fonts embedded in an epub.

    Fonts listed in META-INF/encryption.xml with the Adobe or IDPF
    obfuscation algorithm have their leading bytes XORed with a key
    derived from the book's uuid identifier.  This class finds those
    fonts and reverses the XOR.

    epub        -- object with a ``read(name)`` method returning bytes
                   (a ZipFile in practice); missing names raise KeyError.
    content_dom -- parsed minidom document of the book's .opf file.
    """

    def __init__(self, epub, content_dom):
        self.epub = epub
        self.content_dom = content_dom
        # URI -> algorithm for every font with a known obfuscation.
        self.encryption = {}
        # Set once encryption.xml has been looked for, so a book without
        # obfuscated fonts isn't re-read from the zip on every call.
        self._encryption_loaded = False
        self.old_uuid = None

    def get_file(self, href):
        """Return the raw bytes of *href* from the epub."""
        return self.epub.read(href)

    def get_encrypted_fontfiles(self):
        """Return a dict of font URI -> obfuscation algorithm.

        The dict is empty when the epub has no META-INF/encryption.xml.
        Entries using an algorithm other than the Adobe or IDPF font
        obfuscation are reported on stdout and left out.
        """
        if self.encryption or self._encryption_loaded:
            return self.encryption

        found = {}
        try:
            encryption = self.epub.read("META-INF/encryption.xml")
        except KeyError:
            # No encryption.xml in the epub: nothing is obfuscated.
            encryption = None

        if encryption is not None:
            # <encryption xmlns="urn:oasis:names:tc:opendocument:xmlns:container"
            #             xmlns:enc="http://www.w3.org/2001/04/xmlenc#">
            #   <enc:EncryptedData>
            #     <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>
            #     <enc:CipherData>
            #       <enc:CipherReference URI="fonts/00017.ttf"/>
            #     </enc:CipherData>
            #   </enc:EncryptedData>
            # </encryption>
            encryptiondom = parseString(encryption)
            for encdata in encryptiondom.getElementsByTagName('enc:EncryptedData'):
                methods = encdata.getElementsByTagName('enc:EncryptionMethod')
                # An entry without a method is treated as an unknown one
                # rather than aborting the whole scan.
                algorithm = methods[0].getAttribute('Algorithm') if methods else ''
                if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
                    print("Unknown font encryption: %s"%algorithm)
                    continue
                for encref in encdata.getElementsByTagName('enc:CipherReference'):
                    found[encref.getAttribute('URI')] = algorithm

        self.encryption = found
        self._encryption_loaded = True
        return self.encryption

    def get_old_uuid(self):
        """Return the text of the book's uuid identifier, or None.

        The identifier is the dc:identifier whose id matches the
        package's unique-identifier and whose opf:scheme is "uuid"
        (compared case-insensitively).
        """
        if not self.old_uuid:
            contentdom = self.content_dom
            uidkey = contentdom.getElementsByTagName("package")[0].getAttribute("unique-identifier")
            for dcid in contentdom.getElementsByTagName("dc:identifier"):
                if (dcid.getAttribute("id") == uidkey
                        and dcid.getAttribute("opf:scheme").lower() == "uuid"
                        and dcid.firstChild is not None):
                    self.old_uuid = dcid.firstChild.data
        return self.old_uuid

    def _require_uuid(self):
        """Return the uuid identifier text or raise ValueError if absent."""
        identifier = self.get_old_uuid()
        if not identifier:
            raise ValueError("No uuid identifier found in epub; cannot derive font key")
        return identifier

    def get_idpf_key(self):
        """Return the IDPF key: SHA-1 of the identifier without whitespace."""
        # idpf key:urn:uuid:221c69fe-29f3-4cb4-bb3f-58c430261cc6
        # idpf key:b'\xfb\xa9\x03N}\xae~\x12 \xaa\xe0\xc11\xe2\xe7\x1b\xf6\xa5\xcas'
        import hashlib
        idpf_key = re.sub('[\x20\x09\x0d\x0a]', '', self._require_uuid())
        return hashlib.sha1(idpf_key.encode('utf-8')).digest()

    def get_adobe_key(self):
        """Return the Adobe key: the 16 raw bytes of the uuid."""
        # adobe key:221c69fe-29f3-4cb4-bb3f-58c430261cc6
        # adobe key:b'"\x1ci\xfe)\xf3L\xb4\xbb?X\xc40&\x1c\xc6'
        import uuid
        adobe_key = self._require_uuid().rpartition(':')[-1] # skip urn:uuid:
        return uuid.UUID(adobe_key).bytes

    def get_decrypted_font_data(self, uri):
        """Return the bytes of *uri*, de-obfuscated when it is listed."""
        font_data = self.get_file(uri)
        algorithm = self.get_encrypted_fontfiles().get(uri)
        if algorithm is not None:
            if algorithm == ADOBE_OBFUSCATION:
                key = self.get_adobe_key()
            else:
                key = self.get_idpf_key()
            font_data = self.decrypt_font_data(key, font_data, algorithm)
        return font_data

    def decrypt_font_data(self, key, data, algorithm):
        """XOR the obfuscated head of *data* with the repeating *key*.

        The first 1024 bytes are processed for the Adobe algorithm and
        the first 1040 for any other; the rest is returned unchanged.
        """
        crypt_len = 1024 if algorithm == ADOBE_OBFUSCATION else 1040
        head = bytearray(data[:crypt_len])
        keystream = cycle(iter(bytearray(key)))
        plain = bytes(bytearray(x ^ next(keystream) for x in head))
        return plain + data[crypt_len:]
def _unirepl(match):
    """Return the character for a numeric entity match, plus trailing text.

    *match* comes from the pattern in _replaceNumberEntities: group(1) is
    the number (decimal digits, or ``x`` followed by hex digits) and
    group(2) is any hex-looking characters captured before the ``;``,
    which are passed through unchanged.  Returns "" (dropping the whole
    match) when the number cannot be converted.
    """
    number = match.group(1)
    if number.startswith('x'):
        radix = 16
        digits = number[1:]
    else:
        radix = 10
        digits = number
    try:
        return "%s%s" % (unichr(int(digits, radix)), match.group(2))
    except (ValueError, OverflowError):
        # This way, at least if there's more of entities out there
        # that fail, it doesn't blow the entire download.
        print("Numeric entity translation failed, skipping: &#%s%s;" % (number, match.group(2)))
        return ""
def _replaceNumberEntities(data):
    """Replace numeric character entities in *data* using _unirepl."""
    # The brokenish entity parsing in SGMLParser that inserts ';' after
    # non-entities also inserts it after number entities, swallowing the
    # start of the next word when that looks like hex digits.  The second
    # group captures those stray characters so _unirepl can put them back.
    # The first group allows up to 5 decimal digits, or 'x' plus up to 4
    # hex digits (so 2 digit hex values work too).
    return re.sub(r'&#(x[0-9a-fA-F]{,4}|[0-9]{,5})([0-9a-fA-F]*?);', _unirepl, data)
def _replaceNotEntities(data):
    """Strip the ';' from anything shaped like a named entity reference."""
    # Name pattern is not just \w or \S: it is the entityref regexp from
    # sgmllib's SGMLParser.
    return re.sub(r'&([a-zA-Z][-.a-zA-Z0-9]*);', r'&\1', data)
def stripHTML(soup):
    """Return the text of *soup* with tags and all entities removed."""
    markup = "%s" % soup
    without_tags = re.sub(r'<[^>]+>', '', markup)
    return removeAllEntities(without_tags).strip()
def conditionalRemoveEntities(value):
    """Clean entities out of strings; return any other value untouched."""
    if not isinstance(value, string_types):
        return value
    return removeEntities(value).strip()
def removeAllEntities(text):
    """Return *text* with every entity replaced by its character.

    removeEntities() deliberately leaves &lt;, &gt; and &amp; escaped;
    this converts those three as well, so the result is plain text and
    no longer safe to embed in markup.
    """
    # &amp; goes last so that "&amp;lt;" ends up as the text "&lt;"
    # instead of being unescaped twice.
    return removeEntities(text).replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')
def removeEntities(text):
    """Replace HTML entities in *text* with the characters they stand for.

    Only &amp;, &lt; and &gt; remain escaped in the result.  None gives
    "", and a non-string value is returned as str(value).
    """
    if text is None:
        return ""
    if not isinstance(text, string_types):
        return str(text)

    try:
        t = unicode(text) #.decode('utf-8')
    except UnicodeEncodeError:
        try:
            t = text.encode('ascii', 'xmlcharrefreplace')
        except UnicodeEncodeError:
            t = text
    text = t

    # Replace numeric versions of [&<>] with named versions so the
    # number-entity pass below doesn't turn them into bare characters.
    text = re.sub(r'&#0*38;', '&amp;', text)
    text = re.sub(r'&#0*60;', '&lt;', text)
    text = re.sub(r'&#0*62;', '&gt;', text)

    # Replace the remaining numeric entities with their character.
    text = _replaceNumberEntities(text)

    # Replace named entities with their character, using the module's
    # entities table.  Reverse sort puts an entity with ';' before the
    # same one without, when valid.
    for e in sorted(entities, reverse=True):
        v = entities[e]
        try:
            text = text.replace(e, v)
        except UnicodeDecodeError:
            # for byte-string values such as the pound symbol
            text = text.replace(e, v.decode('utf-8'))

    # SGMLParser, and in turn, BeautifulStoneSoup doesn't parse
    # entities terribly well and inserts (;) after something that
    # it thinks might be an entity.  AT&T becomes AT&T; At this
    # point there should be *no* real entities left, so finding
    # these not-entities and removing the ';' here should be safe.
    text = _replaceNotEntities(text)

    # &lt; &gt; and &amp; are the only html entities allowed in xhtml,
    # put those back.  The previous step stripped their ';'.
    return text.replace('&', '&amp;').replace('&amp;lt', '&lt;').replace('&amp;gt', '&gt;')
| # entity list from http://code.google.com/p/doctype/wiki/CharacterEntitiesConsistent | |
| entities = { 'á' : 'á', | |
| 'Á' : 'Á', | |
| 'Á' : 'Á', | |
| 'á' : 'á', | |
| 'â' : 'â', | |
| 'Â' : 'Â', | |
| 'Â' : 'Â', | |
| 'â' : 'â', | |
| '´' : '´', | |
| '´' : '´', | |
| 'Æ' : 'Æ', | |
| 'æ' : 'æ', | |
| 'Æ' : 'Æ', | |
| 'æ' : 'æ', | |
| 'à' : 'à', | |
| 'À' : 'À', | |
| 'À' : 'À', | |
| 'à' : 'à', | |
| 'ℵ' : 'ℵ', | |
| 'α' : 'α', | |
| 'Α' : 'Α', | |
| '&' : '&', | |
| '&' : '&', | |
| '&' : '&', | |
| '&' : '&', | |
| '∧' : '∧', | |
| '∠' : '∠', | |
| 'å' : 'å', | |
| 'Å' : 'Å', | |
| 'Å' : 'Å', | |
| 'å' : 'å', | |
| '≈' : '≈', | |
| 'ã' : 'ã', | |
| 'Ã' : 'Ã', | |
| 'Ã' : 'Ã', | |
| 'ã' : 'ã', | |
| 'ä' : 'ä', | |
| 'Ä' : 'Ä', | |
| 'Ä' : 'Ä', | |
| 'ä' : 'ä', | |
| '„' : '„', | |
| 'β' : 'β', | |
| 'Β' : 'Β', | |
| '¦' : '¦', | |
| '¦' : '¦', | |
| '•' : '•', | |
| '∩' : '∩', | |
| 'ç' : 'ç', | |
| 'Ç' : 'Ç', | |
| 'Ç' : 'Ç', | |
| 'ç' : 'ç', | |
| '¸' : '¸', | |
| '¸' : '¸', | |
| '¢' : '¢', | |
| '¢' : '¢', | |
| 'χ' : 'χ', | |
| 'Χ' : 'Χ', | |
| 'ˆ' : 'ˆ', | |
| '♣' : '♣', | |
| '≅' : '≅', | |
| '©' : '©', | |
| '©' : '©', | |
| '©' : '©', | |
| '©' : '©', | |
| '↵' : '↵', | |
| '∪' : '∪', | |
| '¤' : '¤', | |
| '¤' : '¤', | |
| '†' : '†', | |
| '‡' : '‡', | |
| '↓' : '↓', | |
| '⇓' : '⇓', | |
| '°' : '°', | |
| '°' : '°', | |
| 'δ' : 'δ', | |
| 'Δ' : 'Δ', | |
| '♦' : '♦', | |
| '÷' : '÷', | |
| '÷' : '÷', | |
| 'é' : 'é', | |
| 'É' : 'É', | |
| 'É' : 'É', | |
| 'é' : 'é', | |
| 'ê' : 'ê', | |
| 'Ê' : 'Ê', | |
| 'Ê' : 'Ê', | |
| 'ê' : 'ê', | |
| 'è' : 'è', | |
| 'È' : 'È', | |
| 'È' : 'È', | |
| 'è' : 'è', | |
| '∅' : '∅', | |
| ' ' : ' ', | |
| ' ' : ' ', | |
| 'ε' : 'ε', | |
| 'Ε' : 'Ε', | |
| '≡' : '≡', | |
| 'η' : 'η', | |
| 'Η' : 'Η', | |
| 'ð' : 'ð', | |
| 'Ð' : 'Ð', | |
| 'Ð' : 'Ð', | |
| 'ð' : 'ð', | |
| 'ë' : 'ë', | |
| 'Ë' : 'Ë', | |
| 'Ë' : 'Ë', | |
| 'ë' : 'ë', | |
| '€' : '€', | |
| '∃' : '∃', | |
| 'ƒ' : 'ƒ', | |
| '∀' : '∀', | |
| '½' : '½', | |
| '½' : '½', | |
| '¼' : '¼', | |
| '¼' : '¼', | |
| '¾' : '¾', | |
| '¾' : '¾', | |
| '⁄' : '⁄', | |
| 'γ' : 'γ', | |
| 'Γ' : 'Γ', | |
| '≥' : '≥', | |
| #'>' : '>', | |
| #'>' : '>', | |
| #'>' : '>', | |
| #'>' : '>', | |
| '↔' : '↔', | |
| '⇔' : '⇔', | |
| '♥' : '♥', | |
| '…' : '…', | |
| 'í' : 'í', | |
| 'Í' : 'Í', | |
| 'Í' : 'Í', | |
| 'í' : 'í', | |
| 'î' : 'î', | |
| 'Î' : 'Î', | |
| 'Î' : 'Î', | |
| 'î' : 'î', | |
| '¡' : '¡', | |
| '¡' : '¡', | |
| 'ì' : 'ì', | |
| 'Ì' : 'Ì', | |
| 'Ì' : 'Ì', | |
| 'ì' : 'ì', | |
| 'ℑ' : 'ℑ', | |
| '∞' : '∞', | |
| '∫' : '∫', | |
| 'ι' : 'ι', | |
| 'Ι' : 'Ι', | |
| '¿' : '¿', | |
| '¿' : '¿', | |
| '∈' : '∈', | |
| 'ï' : 'ï', | |
| 'Ï' : 'Ï', | |
| 'Ï' : 'Ï', | |
| 'ï' : 'ï', | |
| 'κ' : 'κ', | |
| 'Κ' : 'Κ', | |
| 'λ' : 'λ', | |
| 'Λ' : 'Λ', | |
| '«' : '«', | |
| '«' : '«', | |
| '←' : '←', | |
| '⇐' : '⇐', | |
| '⌈' : '⌈', | |
| '“' : '“', | |
| '≤' : '≤', | |
| '⌊' : '⌊', | |
| '∗' : '∗', | |
| '◊' : '◊', | |
| '‎' : '', | |
| '‹' : '‹', | |
| '‘' : '‘', | |
| #'<' : '<', | |
| #'<' : '<', | |
| #'<' : '<', | |
| #'<' : '<', | |
| '¯' : '¯', | |
| '¯' : '¯', | |
| '—' : '—', | |
| 'µ' : 'µ', | |
| 'µ' : 'µ', | |
| '·' : '·', | |
| '·' : '·', | |
| '−' : '−', | |
| 'μ' : 'μ', | |
| 'Μ' : 'Μ', | |
| '∇' : '∇', | |
| ' ' : ' ', | |
| ' ' : ' ', | |
| '–' : '–', | |
| '≠' : '≠', | |
| '∋' : '∋', | |
| '¬' : '¬', | |
| '¬' : '¬', | |
| '∉' : '∉', | |
| '⊄' : '⊄', | |
| 'ñ' : 'ñ', | |
| 'Ñ' : 'Ñ', | |
| 'Ñ' : 'Ñ', | |
| 'ñ' : 'ñ', | |
| 'ν' : 'ν', | |
| 'Ν' : 'Ν', | |
| 'ó' : 'ó', | |
| 'Ó' : 'Ó', | |
| 'Ó' : 'Ó', | |
| 'ó' : 'ó', | |
| 'ô' : 'ô', | |
| 'Ô' : 'Ô', | |
| 'Ô' : 'Ô', | |
| 'ô' : 'ô', | |
| 'Œ' : 'Œ', | |
| 'œ' : 'œ', | |
| 'ò' : 'ò', | |
| 'Ò' : 'Ò', | |
| 'Ò' : 'Ò', | |
| 'ò' : 'ò', | |
| '‾' : '‾', | |
| 'ω' : 'ω', | |
| 'Ω' : 'Ω', | |
| 'ο' : 'ο', | |
| 'Ο' : 'Ο', | |
| '⊕' : '⊕', | |
| '∨' : '∨', | |
| 'ª' : 'ª', | |
| 'ª' : 'ª', | |
| 'º' : 'º', | |
| 'º' : 'º', | |
| 'ø' : 'ø', | |
| 'Ø' : 'Ø', | |
| 'Ø' : 'Ø', | |
| 'ø' : 'ø', | |
| 'õ' : 'õ', | |
| 'Õ' : 'Õ', | |
| 'Õ' : 'Õ', | |
| 'õ' : 'õ', | |
| '⊗' : '⊗', | |
| 'ö' : 'ö', | |
| 'Ö' : 'Ö', | |
| 'Ö' : 'Ö', | |
| 'ö' : 'ö', | |
| '¶' : '¶', | |
| '¶' : '¶', | |
| '∂' : '∂', | |
| '‰' : '‰', | |
| '⊥' : '⊥', | |
| 'φ' : 'φ', | |
| 'Φ' : 'Φ', | |
| 'π' : 'π', | |
| 'Π' : 'Π', | |
| 'ϖ' : 'ϖ', | |
| '±' : '±', | |
| '±' : '±', | |
| '£' : '£', | |
| '£' : '£', | |
| '′' : '′', | |
| '″' : '″', | |
| '∏' : '∏', | |
| '∝' : '∝', | |
| 'ψ' : 'ψ', | |
| 'Ψ' : 'Ψ', | |
| '"' : '"', | |
| '"' : '"', | |
| '"' : '"', | |
| '"' : '"', | |
| '√' : '√', | |
| '»' : '»', | |
| '»' : '»', | |
| '→' : '→', | |
| '⇒' : '⇒', | |
| '⌉' : '⌉', | |
| '”' : '”', | |
| 'ℜ' : 'ℜ', | |
| '®' : '®', | |
| '®' : '®', | |
| '®' : '®', | |
| '®' : '®', | |
| '⌋' : '⌋', | |
| 'ρ' : 'ρ', | |
| 'Ρ' : 'Ρ', | |
| '‏' : '', | |
| '›' : '›', | |
| '’' : '’', | |
| '‚' : '‚', | |
| 'š' : 'š', | |
| 'Š' : 'Š', | |
| '⋅' : '⋅', | |
| '§' : '§', | |
| '§' : '§', | |
| '­' : '', # strange optional hyphenation control character, not just a dash | |
| '­' : '', | |
| 'σ' : 'σ', | |
| 'Σ' : 'Σ', | |
| 'ς' : 'ς', | |
| '∼' : '∼', | |
| '♠' : '♠', | |
| '⊂' : '⊂', | |
| '⊆' : '⊆', | |
| '∑' : '∑', | |
| '¹' : '¹', | |
| '¹' : '¹', | |
| '²' : '²', | |
| '²' : '²', | |
| '³' : '³', | |
| '³' : '³', | |
| '⊃' : '⊃', | |
| '⊇' : '⊇', | |
| 'ß' : 'ß', | |
| 'ß' : 'ß', | |
| 'τ' : 'τ', | |
| 'Τ' : 'Τ', | |
| '∴' : '∴', | |
| 'θ' : 'θ', | |
| 'Θ' : 'Θ', | |
| 'ϑ' : 'ϑ', | |
| ' ' : ' ', | |
| 'þ' : 'þ', | |
| 'Þ' : 'Þ', | |
| 'Þ' : 'Þ', | |
| 'þ' : 'þ', | |
| '˜' : '˜', | |
| '×' : '×', | |
| '×' : '×', | |
| '™' : '™', | |
| 'ú' : 'ú', | |
| 'Ú' : 'Ú', | |
| 'Ú' : 'Ú', | |
| 'ú' : 'ú', | |
| '↑' : '↑', | |
| '⇑' : '⇑', | |
| 'û' : 'û', | |
| 'Û' : 'Û', | |
| 'Û' : 'Û', | |
| 'û' : 'û', | |
| 'ù' : 'ù', | |
| 'Ù' : 'Ù', | |
| 'Ù' : 'Ù', | |
| 'ù' : 'ù', | |
| '¨' : '¨', | |
| '¨' : '¨', | |
| 'ϒ' : 'ϒ', | |
| 'υ' : 'υ', | |
| 'Υ' : 'Υ', | |
| 'ü' : 'ü', | |
| 'Ü' : 'Ü', | |
| 'Ü' : 'Ü', | |
| 'ü' : 'ü', | |
| '℘' : '℘', | |
| 'ξ' : 'ξ', | |
| 'Ξ' : 'Ξ', | |
| 'ý' : 'ý', | |
| 'Ý' : 'Ý', | |
| 'Ý' : 'Ý', | |
| 'ý' : 'ý', | |
| '¥' : '¥', | |
| '¥' : '¥', | |
| 'ÿ' : 'ÿ', | |
| 'Ÿ' : 'Ÿ', | |
| 'ÿ' : 'ÿ', | |
| 'ζ' : 'ζ', | |
| 'Ζ' : 'Ζ', | |
| '‍' : '', # strange spacing control character, not just a space | |
| '‌' : '', # strange spacing control character, not just a space | |
| } | |
| class SplitEpub: | |
def __init__(self, inputio):
    """Open *inputio* as the source epub and reset every lazy cache."""
    self.epub = ZipFile(inputio, 'r')

    # Filled in on demand by get_content_dom().
    self.content_dom = None
    self.content_relpath = None

    # Filled in on demand by get_manifest_items() / get_guide_items().
    self.manifest_items = None
    self.guide_items = None

    # TOC document, its directory, and the href -> entries map.
    self.toc_dom = None
    self.toc_relpath = None
    self.toc_map = None

    # Filled in by get_split_lines().
    self.split_lines = None
    self.origauthors = []
    self.origtitle = None
def get_file(self, href):
    """Return the raw bytes stored under *href* in the source epub."""
    archive = self.epub
    return archive.read(href)
def get_content_dom(self):
    """Return the parsed .opf document, loading it on first use.

    Loading also records the .opf file's directory in
    self.content_relpath.
    """
    if self.content_dom:
        return self.content_dom

    ## Find the .opf file via META-INF/container.xml.
    container_xml = self.epub.read("META-INF/container.xml")
    rootfile = parseString(container_xml).getElementsByTagName("rootfile")[0]
    opf_path = rootfile.getAttribute("full-path")

    self.content_dom = parseString(self.epub.read(opf_path))
    self.content_relpath = get_path_part(opf_path)
    return self.content_dom
def get_content_relpath(self):
    """Return the path to the .opf file; hrefs inside it are relative to it."""
    if self.content_relpath:
        return self.content_relpath
    # Loading the content document sets self.content_relpath as well.
    self.get_content_dom()
    return self.content_relpath
def get_toc_relpath(self):
    """Return the path to the toc.ncx file; hrefs inside it are relative to it."""
    if self.toc_relpath:
        return self.toc_relpath
    # Scanning the manifest sets self.toc_relpath as well.
    self.get_manifest_items()
    return self.toc_relpath
def get_manifest_items(self):
    """Return the manifest index, building it on first use.

    Each manifest item is stored twice:
      "h:" + full href -> (id, media-type)
      "i:" + id        -> (full href, media-type)
    An item with the NCX media type also sets self.toc_relpath and
    self.toc_dom.
    """
    if self.manifest_items:
        return self.manifest_items

    self.manifest_items = {}
    for item in self.get_content_dom().getElementsByTagName("item"):
        fullhref = normpath(unquote(self.get_content_relpath() + item.getAttribute("href")))
        item_id = item.getAttribute("id")
        media_type = item.getAttribute("media-type")

        self.manifest_items["h:" + fullhref] = (item_id, media_type)
        self.manifest_items["i:" + item_id] = (fullhref, media_type)

        if media_type == "application/x-dtbncx+xml":
            # TOC file is only one with this type--as far as I know.
            self.toc_relpath = get_path_part(fullhref)
            self.toc_dom = parseString(self.epub.read(fullhref))
    return self.manifest_items
def get_guide_items(self):
    """Return full href -> (type, title) for the .opf <reference> elements.

    Built on first use and cached in self.guide_items.
    """
    if self.guide_items:
        return self.guide_items

    self.guide_items = {}
    for ref in self.get_content_dom().getElementsByTagName("reference"):
        fullhref = normpath(unquote(self.get_content_relpath() + ref.getAttribute("href")))
        entry = (ref.getAttribute("type"), ref.getAttribute("title"))
        self.guide_items[fullhref] = entry
    return self.guide_items
def get_toc_dom(self):
    """Return the parsed TOC document, scanning the manifest if needed."""
    if self.toc_dom:
        return self.toc_dom
    # Scanning the manifest also sets self.toc_dom.
    self.get_manifest_items()
    return self.toc_dom
| # dict() of href->[(text,anchor),...],... | |
| # eg: "file0001.html"->[("Introduction","anchor01"),("Chapter 1","anchor02")],... | |
def get_toc_map(self):
    """Return a dict of href -> [(text, anchor), ...] built from the TOC.

    anchor is None for an entry that points at the file itself.  The map
    is built on first use and cached in self.toc_map; it is empty when
    the book has no TOC document.
    """
    if not self.toc_map:
        self.toc_map = {}
        toc_dom = self.get_toc_dom()
        if toc_dom is None:
            # No NCX in the manifest: there are no TOC entries to map.
            return self.toc_map

        for navpoint in toc_dom.getElementsByTagName("navPoint"):
            src = normpath(unquote(self.get_toc_relpath()+navpoint.getElementsByTagName("content")[0].getAttribute("src")))
            # Split on the first '#' only, so an extra '#' in the
            # fragment ends up in the anchor instead of raising.
            href, hashmark, fragment = src.partition("#")
            anchor = fragment if hashmark else None

            # The first of these in each navPoint should be the
            # appropriate one (there may be others due to nesting).
            try:
                text = unicode(navpoint.getElementsByTagName("text")[0].firstChild.data)
            except (IndexError, AttributeError):
                # No <text> element, or an empty one.
                text = ""

            entries = self.toc_map.setdefault(href, [])
            if anchor is None:
                # Put file links ahead of anchor links.  Otherwise a
                # non-linear anchor link may take precedence, which will
                # confuse EpubSplit.  This can leave split lines out of
                # order from the TOC, but the alternative is worse.
                # Should be a rare corner case.
                ## Keep order of non-anchor entries to the same file.
                idx = 0
                while idx < len(entries) and entries[idx][1] is None: # [1] is anchor
                    idx += 1
                entries.insert(idx, (text, anchor))
            else:
                entries.append((text, anchor))
    return self.toc_map
| # list of dicts with href, anchor & toc text. | |
| # 'split lines' are all the points that the epub can be split on. | |
| # Offer a split at each spine file and each ToC point. | |
def get_split_lines(self):
    """Build and return the list of points the epub can be split on.

    There is one entry per spine file plus one per TOC entry that has
    an anchor.  Each entry is a dict with 'href', 'anchor', 'toc' (list
    of TOC titles), 'id', 'type', 'num', 'sample' (at most 1500
    characters of content followed by "...") and, for spine files
    listed in the guide, 'guide'.

    Also records the book's title in self.origtitle and its authors in
    self.origauthors.
    """
    def sample_of(content):
        # Samples are capped at 1500 characters.
        if len(content) > 1500:
            return content[:1500] + "..."
        return content

    metadom = self.get_content_dom()

    ## Save indiv book title
    try:
        self.origtitle = metadom.getElementsByTagName("dc:title")[0].firstChild.data
    except (IndexError, AttributeError):
        self.origtitle = "(Title Missing)"

    ## Save authors: creators with role "aut" or no role at all, and
    ## only those that actually have text content.
    for creator in metadom.getElementsByTagName("dc:creator"):
        is_author = (creator.getAttribute("opf:role") == "aut"
                     or not creator.hasAttribute("opf:role"))
        name = getattr(creator.firstChild, "data", None)
        if is_author and name is not None and name not in self.origauthors:
            self.origauthors.append(name)
    if not self.origauthors:
        self.origauthors.append("(Authors Missing)")

    self.split_lines = [] # list of dicts with href, anchor and toc
    # spin on spine files.
    count = 0
    for itemref in metadom.getElementsByTagName("itemref"):
        idref = itemref.getAttribute("idref")
        (href, mediatype) = self.get_manifest_items()["i:"+idref]

        current = {}
        self.split_lines.append(current)
        current['href'] = href
        current['anchor'] = None
        current['toc'] = []
        if href in self.get_guide_items():
            current['guide'] = self.get_guide_items()[href]
        current['id'] = idref
        current['type'] = mediatype
        current['num'] = count
        # Decoded once and reused for every anchor in this file.
        filetext = self.epub.read(href).decode('utf-8')
        current['sample'] = sample_of(filetext)
        count += 1

        # For each toc entry pointing at this file, check to see if
        # there's an anchor; if so, make a new split line.
        for (text, anchor) in self.get_toc_map().get(href, ()):
            # XXX for outputing to screen in CLI--hopefully won't need in plugin?
            try:
                text = "%s"%text
            except Exception:
                text = "(error text)"

            if anchor:
                current = {}
                self.split_lines.append(current)
                current['href'] = href
                current['anchor'] = anchor
                current['toc'] = []
                current['id'] = idref
                current['type'] = mediatype
                current['num'] = count
                # anchor, need to split first, then reduce to 1500.
                current['sample'] = sample_of(splitHtml(filetext, anchor, before=False))
                count += 1

            # There can be more than one toc to the same split line.
            # This won't find multiple toc to the same anchor yet.
            current['toc'].append(text)
    return self.split_lines
| # pass in list of line numbers(?) | |
def get_split_files(self, linenums):
    """Return the content files for the selected split lines.

    linenums -- indexes into the split-line list (anything int() accepts).

    Returns a list of [filename, id, type, filedata] lists, where
    filename is the name handed back by the file cache and filedata is
    the (possibly trimmed and link-adjusted) file content as text.
    """
    self.filecache = FileCache(self.get_manifest_items())

    # set include flag in split_lines.
    if not self.split_lines:
        self.get_split_lines()
    lines = self.split_lines

    wanted = {int(k) for k in linenums}
    for position, entry in enumerate(lines):
        entry['include'] = position in wanted

    # loop through finding 'chunks' -- contiguous pieces in the
    # same file.  Each included file is at least one chunk, but if
    # parts are left out, one original file can end up being more
    # than one chunk.
    outchunks = [] # list of tuples=(filename,start,end) 'end' is not inclusive.
    chunk_start = None # first line of the chunk in progress, if any
    for line in lines:
        if not line['include']:
            if chunk_start is not None: # save previous chunk.
                outchunks.append((chunk_start['href'], chunk_start, line))
                chunk_start = None
            continue

        if chunk_start is None: # start new chunk
            chunk_start = line
        elif chunk_start['href'] != line['href']:
            # different file, new chunk.
            outchunks.append((chunk_start['href'], chunk_start, line))
            chunk_start = line

    # final chunk for when last in list is include.
    if chunk_start is not None:
        outchunks.append((chunk_start['href'], chunk_start, None))

    outfiles = [] # [filename,id,type,data] -- filename changed to unique
    for (href, first, stop) in outchunks:
        filedata = self.epub.read(href).decode('utf-8')

        # discard before start if anchor.
        if first['anchor'] is not None:
            filedata = splitHtml(filedata, first['anchor'], before=False)

        # discard from end anchor on(inclusive), but only if same file.  If
        # different file, keep rest of file.  If no 'end', then it was the
        # last chunk and went to the end of the last file.
        if stop is not None and stop['anchor'] is not None and stop['href'] == href:
            filedata = splitHtml(filedata, stop['anchor'], before=True)

        filename = self.filecache.add_content_file(href, filedata)
        outfiles.append([filename, first['id'], first['type'], filedata])

    # Spin through to replace internal URLs
    for fl in outfiles:
        soup = BeautifulSoup(fl[3], 'html5lib')
        changed = False
        for a in soup.findAll('a'):
            if not a.has_attr('href'):
                continue
            path = normpath(unquote("%s%s"%(get_path_part(fl[0]), a['href'])))
            if path in self.filecache.anchors and self.filecache.anchors[path] != path:
                a['href'] = self.filecache.anchors[path][len(get_path_part(fl[0])):]
                changed = True
        if changed:
            fl[3] = unicode(soup)

    return outfiles
| def write_split_epub(self, | |
| outputio, | |
| linenums, | |
| changedtocs={}, | |
| authoropts=[], | |
| titleopt=None, | |
| descopt=None, | |
| tags=[], | |
| languages=['en'], | |
| coverjpgpath=None): | |
| files = self.get_split_files(linenums) | |
| ## Write mimetype file, must be first and uncompressed. | |
| ## Older versions of python(2.4/5) don't allow you to specify | |
| ## compression by individual file. | |
| ## Overwrite if existing output file. | |
| outputepub = ZipFile(outputio, "w", compression=ZIP_STORED) | |
| outputepub.debug = 3 | |
| outputepub.writestr("mimetype", "application/epub+zip") | |
| outputepub.close() | |
| ## Re-open file for content. | |
| outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED) | |
| outputepub.debug = 3 | |
| ## Create META-INF/container.xml file. The only thing it does is | |
| ## point to content.opf | |
| containerdom = getDOMImplementation().createDocument(None, "container", None) | |
| containertop = containerdom.documentElement | |
| containertop.setAttribute("version","1.0") | |
| containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container") | |
| rootfiles = containerdom.createElement("rootfiles") | |
| containertop.appendChild(rootfiles) | |
| rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf", | |
| "media-type":"application/oebps-package+xml"})) | |
| outputepub.writestr("META-INF/container.xml",containerdom.toprettyxml(indent=' ',encoding='utf-8')) | |
| #### ## create content.opf file. | |
| uniqueid="epubsplit-uid-%d" % time() # real sophisticated uid scheme. | |
| contentdom = getDOMImplementation().createDocument(None, "package", None) | |
| package = contentdom.documentElement | |
| package.setAttribute("version","2.0") | |
| package.setAttribute("xmlns","http://www.idpf.org/2007/opf") | |
| package.setAttribute("unique-identifier","epubsplit-id") | |
| metadata=newTag(contentdom,"metadata", | |
| attrs={"xmlns:dc":"http://purl.org/dc/elements/1.1/", | |
| "xmlns:opf":"http://www.idpf.org/2007/opf"}) | |
| metadata.appendChild(newTag(contentdom,"dc:identifier",text=uniqueid,attrs={"id":"epubsplit-id"})) | |
| if( titleopt is None ): | |
| titleopt = self.origtitle+" Split" | |
| metadata.appendChild(newTag(contentdom,"dc:title",text=titleopt)) | |
| if( authoropts and len(authoropts) > 0 ): | |
| useauthors=authoropts | |
| else: | |
| useauthors=self.origauthors | |
| usedauthors=dict() | |
| for author in useauthors: | |
| if( author not in usedauthors ): | |
| usedauthors[author]=author | |
| metadata.appendChild(newTag(contentdom,"dc:creator", | |
| attrs={"opf:role":"aut"}, | |
| text=author)) | |
| metadata.appendChild(newTag(contentdom,"dc:contributor",text="epubsplit",attrs={"opf:role":"bkp"})) | |
| metadata.appendChild(newTag(contentdom,"dc:rights",text="Copyrights as per source stories")) | |
| if languages: | |
| for l in languages: | |
| metadata.appendChild(newTag(contentdom,"dc:language",text=l)) | |
| else: | |
| metadata.appendChild(newTag(contentdom,"dc:language",text="en")) | |
| if not descopt: | |
| # created now, but not filled in until TOC generation to save loops. | |
| description = newTag(contentdom,"dc:description",text="Split from %s by %s."%(self.origtitle,", ".join(self.origauthors))) | |
| else: | |
| description = newTag(contentdom,"dc:description",text=descopt) | |
| metadata.appendChild(description) | |
| for tag in tags: | |
| metadata.appendChild(newTag(contentdom,"dc:subject",text=tag)) | |
| package.appendChild(metadata) | |
| manifest = contentdom.createElement("manifest") | |
| package.appendChild(manifest) | |
| spine = newTag(contentdom,"spine",attrs={"toc":"ncx"}) | |
| package.appendChild(spine) | |
| manifest.appendChild(newTag(contentdom,"item", | |
| attrs={'id':'ncx', | |
| 'href':'toc.ncx', | |
| 'media-type':'application/x-dtbncx+xml'})) | |
| if coverjpgpath: | |
| # <meta name="cover" content="cover.jpg"/> | |
| metadata.appendChild(newTag(contentdom,"meta",{"name":"cover", | |
| "content":"coverimageid"})) | |
| # cover stuff for later: | |
| # at end of <package>: | |
| # <guide> | |
| # <reference type="cover" title="Cover" href="Text/cover.xhtml"/> | |
| # </guide> | |
| guide = newTag(contentdom,"guide") | |
| guide.appendChild(newTag(contentdom,"reference",attrs={"type":"cover", | |
| "title":"Cover", | |
| "href":"cover.xhtml"})) | |
| package.appendChild(guide) | |
| manifest.appendChild(newTag(contentdom,"item", | |
| attrs={'id':"coverimageid", | |
| 'href':"cover.jpg", | |
| 'media-type':"image/jpeg"})) | |
| # Note that the id of the cover xhmtl *must* be 'cover' | |
| # for it to work on Nook. | |
| manifest.appendChild(newTag(contentdom,"item", | |
| attrs={'id':"cover", | |
| 'href':"cover.xhtml", | |
| 'media-type':"application/xhtml+xml"})) | |
| spine.appendChild(newTag(contentdom,"itemref", | |
| attrs={"idref":"cover", | |
| "linear":"yes"})) | |
| contentcount=0 | |
| for (filename,id,type,filedata) in files: | |
| #filename = self.filecache.addHtml(href,filedata) | |
| #print("writing :%s"%filename) | |
| # add to manifest and spine | |
| if coverjpgpath and filename == "cover.xhtml": | |
| continue # don't dup cover. | |
| outputepub.writestr(filename,filedata.encode('utf-8')) | |
| id = "a%d"%contentcount | |
| contentcount += 1 | |
| manifest.appendChild(newTag(contentdom,"item", | |
| attrs={'id':id, | |
| 'href':filename, | |
| 'media-type':type})) | |
| spine.appendChild(newTag(contentdom,"itemref", | |
| attrs={"idref":id, | |
| "linear":"yes"})) | |
| fontdecrypter = FontDecrypter(self.epub,self.get_content_dom()) | |
| linked='' | |
| for (linked,type) in self.filecache.linkedfiles: | |
| # print("linked files:(%s,%s)"%(linked,type)) | |
| # add to manifest | |
| if coverjpgpath and linked == "cover.jpg": | |
| continue # don't dup cover. | |
| try: | |
| linkeddata = self.get_file(linked) | |
| if linked in fontdecrypter.get_encrypted_fontfiles(): | |
| print("Decrypting font file: %s"%linked) | |
| linkeddata = fontdecrypter.get_decrypted_font_data(linked) | |
| outputepub.writestr(linked,linkeddata) | |
| except Exception as e: | |
| print("Skipping linked file (%s)\nException: %s"%(linked,e)) | |
| id = "a%d"%contentcount | |
| contentcount += 1 | |
| manifest.appendChild(newTag(contentdom,"item", | |
| attrs={'id':id, | |
| 'href':linked, | |
| 'media-type':type})) | |
| contentxml = contentdom.toprettyxml(indent=' ') # ,encoding='utf-8' | |
| # tweak for brain damaged Nook STR. Nook insists on name before content. | |
| contentxml = contentxml.replace('<meta content="coverimageid" name="cover"/>', | |
| '<meta name="cover" content="coverimageid"/>') | |
| outputepub.writestr("content.opf",contentxml) | |
| ## create toc.ncx file | |
| tocncxdom = getDOMImplementation().createDocument(None, "ncx", None) | |
| ncx = tocncxdom.documentElement | |
| ncx.setAttribute("version","2005-1") | |
| ncx.setAttribute("xmlns","http://www.daisy.org/z3986/2005/ncx/") | |
| head = tocncxdom.createElement("head") | |
| ncx.appendChild(head) | |
| head.appendChild(newTag(tocncxdom,"meta", | |
| attrs={"name":"dtb:uid", "content":uniqueid})) | |
| depthnode = newTag(tocncxdom,"meta", | |
| attrs={"name":"dtb:depth", "content":"1"}) | |
| head.appendChild(depthnode) | |
| head.appendChild(newTag(tocncxdom,"meta", | |
| attrs={"name":"dtb:totalPageCount", "content":"0"})) | |
| head.appendChild(newTag(tocncxdom,"meta", | |
| attrs={"name":"dtb:maxPageNumber", "content":"0"})) | |
| docTitle = tocncxdom.createElement("docTitle") | |
| docTitle.appendChild(newTag(tocncxdom,"text",text=stripHTML(titleopt))) | |
| ncx.appendChild(docTitle) | |
| tocnavMap = tocncxdom.createElement("navMap") | |
| ncx.appendChild(tocnavMap) | |
| # come back to lines again for TOC because files only has files(gasp-shock!) | |
| count=1 | |
| for line in self.split_lines: | |
| if line['include']: | |
| # if changed, use only changed values. | |
| if line['num'] in changedtocs: | |
| line['toc'] = changedtocs[line['num']] | |
| # can have more than one toc entry. | |
| for title in line['toc']: | |
| newnav = newTag(tocncxdom,"navPoint", | |
| {"id":"a%03d"%count,"playOrder":"%d" % count}) | |
| count += 1 | |
| tocnavMap.appendChild(newnav) | |
| navlabel = newTag(tocncxdom,"navLabel") | |
| newnav.appendChild(navlabel) | |
| # For purposes of TOC titling & desc, use first book author | |
| navlabel.appendChild(newTag(tocncxdom,"text",text=stripHTML(title))) | |
| # Find the first 'spine' item's content for the title navpoint. | |
| # Many epubs have the first chapter as first navpoint, so we can't just | |
| # copy that anymore. | |
| if line['anchor'] and line['href']+"#"+line['anchor'] in self.filecache.anchors: | |
| src = self.filecache.anchors[line['href']+"#"+line['anchor']] | |
| #print("toc from anchors(%s#%s)(%s)"%(line['href'],line['anchor'],src)) | |
| else: | |
| #print("toc from href(%s)"%line['href']) | |
| src = line['href'] | |
| newnav.appendChild(newTag(tocncxdom,"content", | |
| {"src":src})) | |
| outputepub.writestr("toc.ncx",tocncxdom.toprettyxml(indent=' ',encoding='utf-8')) | |
| if coverjpgpath: | |
| # write, not write string. Pulling from file. | |
| outputepub.write(coverjpgpath,"cover.jpg") | |
| outputepub.writestr("cover.xhtml",''' | |
| <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"><head><title>Cover</title><style type="text/css" title="override_css"> | |
| @page {padding: 0pt; margin:0pt} | |
| body { text-align: center; padding:0pt; margin: 0pt; } | |
| div { margin: 0pt; padding: 0pt; } | |
| </style></head><body><div> | |
| <img src="cover.jpg" alt="cover"/> | |
| </div></body></html> | |
| ''') | |
| # declares all the files created by Windows. otherwise, when | |
| # it runs in appengine, windows unzips the files as 000 perms. | |
| for zf in outputepub.filelist: | |
| zf.create_system = 0 | |
| outputepub.close() | |
class FileCache:
    """Bookkeeping for the content files and linked resources of a split epub.

    Instance attributes:
      manifest_items -- mapping supplied by the caller.  Entries whose key
                        starts with 'i:' are read as (href, media-type, ...);
                        entries keyed 'h:' + href are read for their
                        media-type at index 1.
      oldnew         -- original href -> list of names issued for that href.
      newold         -- issued name -> original href.
      anchors        -- 'href#id' -> 'issued-name#id' for every tag id seen.
      linkedfiles    -- set of (normalized href, media-type) tuples.
    """

    # Manifest media types that mark an item as an embedded font file.
    _FONT_MEDIA_TYPES = ('application/vnd.ms-opentype',
                         'application/x-font-ttf',
                         'application/x-font-truetype',
                         'application/font-sfnt')

    def __init__(self, manifest_items=None):
        """Create an empty cache, pre-registering font files from the manifest.

        manifest_items -- optional manifest mapping (see class docstring).
                          When omitted, each instance gets its own empty dict.
        """
        # A literal `{}` default would be one dict shared by every instance.
        self.manifest_items = {} if manifest_items is None else manifest_items
        self.oldnew = {}
        self.newold = {}
        self.anchors = {}
        self.linkedfiles = set()

        ## always include font files for embedded fonts
        for key, value in self.manifest_items.items():
            # print("manifest:%s %s"%(key,value))
            if key.startswith('i:') and value[1] in self._FONT_MEDIA_TYPES:
                self.add_linked_file(value[0])

    def add_linked_file(self, href):
        """Record *href* as a linked (non-content) file.

        The href is URL-unquoted and path-normalized first.  Its media type
        is looked up under 'h:' + href in manifest_items; 'unknown' is used
        when the manifest has no such entry.
        """
        href = normpath(unquote(href))  # fix %20 & /../
        manifest_key = "h:" + href
        if manifest_key in self.manifest_items:
            media_type = self.manifest_items[manifest_key][1]
        else:
            media_type = 'unknown'
        self.linkedfiles.add((href, media_type))

    def add_content_file(self, href, filedata):
        """Register a content file and return the name issued for it.

        The first time an href is added its own name is issued.  Each later
        addition of the same href is issued '<dir><n>-<file>', where n is the
        number of names already issued for that href.

        filedata is parsed with html5lib to record every tag id in
        self.anchors and to register linked images and stylesheets.
        """
        if href not in self.oldnew:
            self.oldnew[href] = []
            newfile = href
        else:
            newfile = "%s%d-%s" % (get_path_part(href),
                                   len(self.oldnew[href]),
                                   get_file_part(href))
        self.oldnew[href].append(newfile)
        self.newold[newfile] = href
        #print("newfile:%s"%newfile)

        soup = BeautifulSoup(filedata, 'html5lib')
        #print("soup head:%s"%soup.find('head'))

        # Any tag with an id is a potential link target, not just <a>.
        for tag in soup.find_all():
            if tag.has_attr('id'):
                self.anchors[href + '#' + tag['id']] = newfile + '#' + tag['id']

        # <image> from baen epub.
        # <image width="462" height="616" xlink:href="cover.jpeg"/>
        for img in soup.find_all('img') + soup.find_all('image'):
            src = None
            if img.has_attr('src'):
                src = img['src']
            # xlink:href wins when a tag carries both attributes.
            if img.has_attr('xlink:href'):
                src = img['xlink:href']
            if src:
                self.add_linked_file(get_path_part(href) + src)
            else:
                logger.info("img tag without src in file:(%s) tag:(%s)", href, img)

        # link href="0.css" type="text/css"
        for style in soup.find_all('link', {'type': 'text/css'}):
            #print("link:%s"%style)
            if style.has_attr('href'):
                self.add_linked_file(get_path_part(href) + style['href'])

        return newfile
def splitHtml(data,tagid,before=False):
    """Trim an HTML document at the tag whose id is *tagid*.

    data   -- HTML markup, parsed with the 'lxml' parser.
    tagid  -- id attribute of the tag to split at.
    before -- when true, drop the split tag itself plus the sibling tags
              that follow it (and follow each of its ancestors below
              <body>); otherwise keep the split tag and drop the sibling
              tags that precede it and those ancestors.

    Returns *data* unchanged when no tag has that id.  Otherwise returns the
    trimmed document as text, with every run of line breaks (and the spaces
    before them) collapsed to a single CRLF.
    """
    soup = BeautifulSoup(data,'lxml')
    #print("splitHtml.soup head:%s"%soup.find('head'))

    splitpoint = soup.find(id=tagid)
    #print("splitpoint:%s"%splitpoint)
    if splitpoint is None:
        return data

    def prune_siblings(tag):
        # Detach the siblings on the side of *tag* that is being discarded.
        if before:
            unwanted = tag.find_next_siblings()
        else:
            unwanted = tag.find_previous_siblings()
        for sibling in unwanted:
            sibling.extract()

    prune_siblings(splitpoint)
    # Repeat for each enclosing tag, stopping at <body>.
    ancestor = splitpoint.parent
    while ancestor and ancestor.name != 'body':
        prune_siblings(ancestor)
        ancestor = ancestor.parent

    if before:
        # The split tag belongs to the later part, so it goes too.
        splitpoint.extract()

    return re.sub(r'( *\r?\n)+','\r\n',unicode(soup))
def get_path_part(n):
    """Return the directory portion of path *n* with a trailing '/'.

    An empty string is returned when *n* has no directory component.
    """
    directory = os.path.dirname(n)
    if not directory:
        return directory
    return directory + "/"
def get_file_part(n):
    """Return the final component (file name) of path *n*."""
    _directory, filename = os.path.split(n)
    return filename
| ## Utility method for creating new tags. | |
def newTag(dom,name,attrs=None,text=None):
    """Create and return a new element named *name* using document *dom*.

    attrs -- optional mapping of attribute names to values to set.
    text  -- optional string added to the element as a text-node child.

    The element is only created, not attached anywhere in the document.
    """
    element = dom.createElement(name)
    if attrs is not None:
        for attr_name, attr_value in attrs.items():
            element.setAttribute(attr_name, attr_value)
    if text is not None:
        text_node = dom.createTextNode(text)
        element.appendChild(text_node)
    return element
def _group_sections(lines, section_lines, default_title):
    """Group line numbers into sections for --split-by-section.

    A line with a Table of Contents entry starts a new section; a line
    without one is folded into the section before it.  The very first line
    always starts a section.

    Returns a list of (line-number list, title) tuples, one per section.
    The title is the line's first ToC entry, or *default_title* when the
    section's first line has no ToC entry.
    """
    splitslist = []
    sectionlist = []
    for lineno in section_lines:
        toclist = lines[int(lineno)]['toc']
        if sectionlist and not toclist:
            # sectionlist is the same list object already stored in
            # splitslist, so appending here extends that section in place.
            sectionlist.append(lineno)
        else:
            ## take title from (first) ToC if available, else titleopt
            title = (toclist[0] if toclist else default_title)
            print("title: %s"%title)
            sectionlist = [lineno]
            splitslist.append((sectionlist,title))
    # Every section was appended when it was started; appending the last one
    # again here would write the final split epub twice.
    return splitslist


def main(argv,usage=None):
    """Command-line entry point for listing split points or writing splits.

    argv  -- list of command-line arguments, without the program name.
    usage -- optional usage prefix for the help text; defaults to
             'usage: python %prog'.

    With no positional arguments the help text is printed.  With only an
    input epub, its possible split lines are listed.  With line numbers
    (or --split-by-section) one or more epubs are written.  Returns None.
    """
    from optparse import OptionParser
    if not usage:
        # read in args, anything starting with -- will be treated as --<variable>=<value>
        usage = 'usage: python %prog'
    parser = OptionParser(usage+''' [options] <input epub> [line numbers...]
Giving an epub without line numbers will return a list of line numbers: the
possible split points in the input file. Calling with line numbers will
generate an epub with each of the "lines" given included.''')
    parser.add_option("-o", "--output", dest="outputopt", default="split.epub",
                      help="Set OUTPUT file, Default: split.epub", metavar="OUTPUT")
    parser.add_option("--output-dir", dest="outputdiropt",
                      help="Set OUTPUT directory, Default: present working directory")
    parser.add_option('--split-by-section',
                      action='store_true', dest='split_by_section',
                      help='Create a new epub from each of the listed line sections instead of one containing all. Splits all sections if no lines numbers are given. Each split will be named <number>-<output name> and placed in the output-dir. Sections without a Table of Contents entry will be included with the preceding section(s)', )
    parser.add_option("-t", "--title", dest="titleopt", default=None,
                      help="Use TITLE as the metadata title. Default: '<original epub title> Split' or ToC entry with --split-by-section", metavar="TITLE")
    parser.add_option("-d", "--description", dest="descopt", default=None,
                      help="Use DESC as the metadata description. Default: 'Split from <epub title> by <author>'.", metavar="DESC")
    parser.add_option("-a", "--author",
                      action="append", dest="authoropts", default=[],
                      help="Use AUTHOR as a metadata author, multiple authors may be given, Default: <All authors from original epub>", metavar="AUTHOR")
    parser.add_option("-g", "--tag",
                      action="append", dest="tagopts", default=[],
                      help="Include TAG as dc:subject tag, multiple tags may be given, Default: None", metavar="TAG")
    parser.add_option("-l", "--language",
                      action="append", dest="languageopts", default=[],
                      help="Include LANG as dc:language tag, multiple languages may be given, Default: en", metavar="LANG")
    parser.add_option("-c", "--cover", dest="coveropt", default=None,
                      help="Path to a jpg to use as cover image.", metavar="COVER")

    (options, args) = parser.parse_args(argv)

    ## Add .epub if not already there.
    if not options.outputopt.lower().endswith(".epub"):
        options.outputopt=options.outputopt+".epub"

    if not options.languageopts:
        options.languageopts = ['en']

    if not args:
        parser.print_help()
        return

    epubO = SplitEpub(args[0])
    lines = epubO.get_split_lines()

    def write_output(outputfile, linenos, title):
        # Shared by both write modes: resolve the output path, report it,
        # and write one epub holding the given lines.
        if options.outputdiropt:
            outputfile = os.path.join(options.outputdiropt,outputfile)
        print("output file: "+outputfile)
        epubO.write_split_epub(outputfile,
                               linenos,
                               authoropts=options.authoropts,
                               titleopt=title,
                               descopt=options.descopt,
                               tags=options.tagopts,
                               languages=options.languageopts,
                               coverjpgpath=options.coveropt)

    if options.split_by_section:
        # No explicit line numbers means: split every section.
        if len(args) > 1:
            section_lines = args[1:]
        else:
            section_lines = range(len(lines))
        splitslist = _group_sections(lines, section_lines, options.titleopt)
        # print(splitslist)
        for filecount, (sectionlist, title) in enumerate(splitslist, 1):
            write_output("%0.4d-%s"%(filecount,options.outputopt),
                         sectionlist,
                         title)
        return

    if len(args) == 1:
        # Only an input epub: list the possible split points.
        showlist=['toc','guide','anchor','id','href']
        for count, line in enumerate(lines):
            print("\nLine Number: %d"%count)
            for s in showlist:
                if s in line and line[s]:
                    print("\t%s: %s"%(s,line[s]))
        return

    # Input epub plus line numbers: one epub containing all listed lines.
    write_output(options.outputopt, args[1:], options.titleopt)
    return
# Run the command-line interface only when this file is executed directly;
# the program name is dropped before the arguments are handed to main().
if __name__ == "__main__":
    cli_arguments = sys.argv[1:]
    main(cli_arguments)