Spaces:

Awell00
/

book_summarizer

Paused

App Files Files Community

Awell00 commited on Jan 3, 2025

Commit

feb2065

verified ·

1 Parent(s): 2f407ca

feat!: add all files

Browse files

Files changed (4) hide show

EpubSplit.zip +3 -0
app.py +248 -0
epubsplit.py +1367 -0
requirements.txt +14 -0

EpubSplit.zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7e0dcc5fe502500fde5108298f7074c9979a27ac4452734b15e80c046b65f75e
+size 484014

app.py ADDED Viewed

	@@ -0,0 +1,248 @@

+import gradio as gr
+import warnings
+import requests
+from bs4 import BeautifulSoup
+import subprocess
+import io
+import ebooklib
+from ebooklib import epub
+from huggingface_hub import InferenceClient
+from epubsplit import SplitEpub
+import re
+import os
+def install_calibre():
+    try:
+        subprocess.run(["apt-get", "update"], check=True)
+        subprocess.run(["apt-get", "install", "-y", "calibre"], check=True),
+        subprocess.run(["calibre-customize", "-a", "EpubSplit.zip"], check=True)
+        print("Calibre installed successfully.")
+    except subprocess.CalledProcessError as e:
+        print(f"Error installing calibre: {e}")
+install_calibre()
+# Suppress specific warnings
+warnings.filterwarnings("ignore", message="In the future version we will turn default option ignore_ncx to True.")
+warnings.filterwarnings("ignore", message="This search incorrectly ignores the root element, and will be fixed in a future version.")
+# Constants
+EPUB_PATH = 'book.epub'
+OUTPUT_EPUB_PATH = 'output.epub'
+OUTPUT_PDF_PATH = 'output.pdf'
+LIBRARY_URL = os.getenv("LIBRARY_URL")
+COOKIE_CONFIG = {
+    'remix_userkey': os.getenv("LIBRARY_KEY"),
+    'remix_userid': '14009766',
+    'selectedSiteMode': 'books',
+    'domainsNotWorking': os.getenv("NOT_WORKING")
+}
+def fetch_library_search_url():
+    try:
+        response = requests.get(LIBRARY_URL)
+        soup = BeautifulSoup(response.content, 'html5lib')
+        library_div = soup.find('div', attrs={'class': 'plainlist'})
+        if library_div:
+            links = library_div.find_all('a', class_='external text')
+            return next((link.get('href') for link in links if link.get('href').startswith('https')), "")
+    except Exception as e:
+        print(f"Error fetching library URL: {e}")
+    return ""
+SEARCH_URL = fetch_library_search_url()
+def fetch_book_details(isbn):
+    if not SEARCH_URL:
+        print("Search URL not available.")
+        return
+    search_endpoint = f"{SEARCH_URL}/s/{isbn}"
+    try:
+        response = requests.get(search_endpoint)
+        soup = BeautifulSoup(response.content, 'html5lib')
+        bookcards = soup.find_all('z-bookcard')
+        book_url = next((SEARCH_URL + card.get('href') for card in bookcards if card.get('href')), None)
+        if not book_url:
+            print("No book URL found.")
+            return
+        download_book(book_url)
+    except Exception as e:
+        print(f"Error fetching book details: {e}")
+def download_book(book_url):
+    try:
+        response = requests.get(book_url, cookies=COOKIE_CONFIG)
+        soup = BeautifulSoup(response.content, 'html5lib')
+        download_link = soup.find('a', class_='addDownloadedBook')
+        if download_link and download_link.has_attr('href'):
+            download_url = SEARCH_URL + download_link['href']
+            download_and_convert_epub(download_url)
+        else:
+            print("Download link not found or invalid.")
+    except Exception as e:
+        print(f"Error downloading book: {e}")
+def download_and_convert_epub(download_url):
+    try:
+        response = requests.get(download_url, cookies=COOKIE_CONFIG)
+        if response.status_code == 200:
+            with open(EPUB_PATH, 'wb') as epub_file:
+                epub_file.write(response.content)
+            print("EPUB downloaded successfully.")
+        else:
+            print(f"Failed to download EPUB. Status code: {response.status_code}")
+    except Exception as e:
+        print(f"Error downloading EPUB: {e}")
+def extract_chapter_text(input_epub_path, chapter_indices):
+    print(f"Extracting chapter text for indices: {chapter_indices}")
+    try:
+        with open(input_epub_path, 'rb') as epub_file:
+            split_epub = SplitEpub(epub_file)
+            output_io = io.BytesIO()
+            split_epub.write_split_epub(output_io, chapter_indices)
+            with open(OUTPUT_EPUB_PATH, 'wb') as output_file:
+                output_file.write(output_io.getvalue())
+        return read_text_from_epub(OUTPUT_EPUB_PATH)
+    except Exception as e:
+        print(f"Error extracting chapter text: {e}")
+        return ""
+def read_text_from_epub(output_epub_path):
+    try:
+        book = epub.read_epub(output_epub_path)
+        text_content = []
+        for item in book.get_items():
+            if item.get_type() == ebooklib.ITEM_DOCUMENT:
+                soup = BeautifulSoup(item.get_body_content(), 'html.parser')
+                paragraphs = soup.find_all('p')
+                text_content.extend(para.get_text() for para in paragraphs)
+        return '\n'.join(text_content)
+    except Exception as e:
+        print(f"Error reading text from EPUB: {e}")
+        return ""
+def generate_table_of_contents():
+    try:
+        result = subprocess.run(['calibre-debug', '--run-plugin', 'EpubSplit', EPUB_PATH], capture_output=True, text=True)
+        pattern = re.compile(r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]')
+        print(result)
+        return {int(line_number): title for line_number, title in pattern.findall(result.stdout)}
+    except Exception as e:
+        print(f"Error generating table of contents: {e}")
+        return {}
+def summarize_chapter(chapter_index):
+    if chapter_index < 0:
+        return "Invalid chapter selection."
+    result = subprocess.run(['calibre-debug', '--run-plugin', 'EpubSplit', EPUB_PATH], capture_output=True, text=True)
+    pattern = re.compile(r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]')
+    chapter = [int(line_number) for line_number, title in pattern.findall(result.stdout)]
+    chapter_text = ""
+    for i in range(chapter_index, chapter[chapter.index(chapter_index)+1]):
+        chapter_to_summarize = extract_chapter_text(EPUB_PATH, [i])
+        if chapter_to_summarize and len(chapter_to_summarize) > 50:
+            chapter_text += generate_summary(chapter_to_summarize)
+            chapter_text += "\n\n"
+    if not chapter_text:
+        chapter_to_summarize = extract_chapter_text(EPUB_PATH, [chapter_index+1])
+        if chapter_to_summarize and len(chapter_to_summarize) > 50:
+            chapter_text += generate_summary(chapter_to_summarize)
+            chapter_text += "\n\n"
+    return chapter_text if chapter_text else "No content found for the selected chapter."
+def generate_summary(text):
+    try:
+        client = InferenceClient(api_key=TOKEN)
+        user_prompt = (
+            "Provide a clear and concise summary of the chapter, emphasizing key events, themes, and character developments. "
+            "Do not include introductory or concluding remarks, just focus on the main points."
+            f"\n\nChapter Text:\n{text}"
+        )
+        system_message = {
+            "role": "system",
+            "content": (
+                "You are an expert at summarizing book chapters. Your task is to condense the chapter into a focused, "
+                "informative summary that highlights the most important events, themes, and character developments. "
+                "Avoid filler and ensure the summary is succinct yet comprehensive."
+            )
+        }
+        messages = [
+            system_message,
+            {"role": "user", "content": user_prompt}
+        ]
+        stream = client.chat.completions.create(
+            model=MODEL,
+            messages=messages,
+            temperature=0.5,
+            max_tokens=2048,
+            top_p=0.7,
+            stream=True
+        )
+        out = ""
+        for chunk in stream:
+            if chunk.choices and len(chunk.choices) > 0:
+                new_content = chunk.choices[0].delta.content
+                out += new_content
+                yield out
+        return out
+    except Exception as e:
+        print(f"Error generating summary: {e}")
+        return "Error generating summary."
+# Model Initialization
+MODEL = "meta-llama/Llama-3.3-70B-Instruct"
+TOKEN = os.getenv("TOKEN")
+# Gradio App
+with gr.Blocks() as app:
+    isbn_input = gr.Textbox(label="Enter ISBN")
+    chapter_dropdown = gr.Dropdown(label="Select Chapter", choices=[])
+    summary_output = gr.Textbox(label="Summary", lines=10, interactive=False)
+    def update_chapter_dropdown(isbn):
+        fetch_book_details(isbn)
+        chapters = generate_table_of_contents()
+        print(chapters)
+        return gr.update(choices=[(title.strip('\''), line_number) for line_number, title in chapters.items()])
+    def stream_summarize_chapter(chapter_index):
+        if chapter_index < 0:
+            yield "Invalid chapter selection."
+            return
+        result = subprocess.run(['calibre-debug', '--run-plugin', 'EpubSplit', EPUB_PATH], capture_output=True, text=True)
+        pattern = re.compile(r'Line Number: (\d+)\n(?:\t.*\n)*\ttoc: \[(.*?)\]')
+        chapter = [int(line_number) for line_number, title in pattern.findall(result.stdout)]
+        if not chapter:
+            yield "No content found for the selected chapter."
+            return
+        for i in range(chapter_index, chapter[chapter.index(chapter_index) + 1]):
+            chapter_to_summarize = extract_chapter_text(EPUB_PATH, [i])
+            if chapter_to_summarize and len(chapter_to_summarize) > 50:
+                for text_chunk in generate_summary(chapter_to_summarize):
+                    yield text_chunk
+            else:
+                yield "No significant content found for this chapter."
+    isbn_input.change(update_chapter_dropdown, inputs=[isbn_input], outputs=[chapter_dropdown])
+    chapter_dropdown.change(
+        stream_summarize_chapter, inputs=[chapter_dropdown], outputs=[summary_output]
+    )
+    app.launch()

epubsplit.py ADDED Viewed

	@@ -0,0 +1,1367 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+__license__   = 'GPL v3'
+__copyright__ = '2021, Jim Miller'
+__docformat__ = 'restructuredtext en'
+import sys, re, os, traceback, copy
+from posixpath import normpath
+import logging
+logger = logging.getLogger(__name__)
+from zipfile import ZipFile, ZIP_STORED, ZIP_DEFLATED
+from xml.dom.minidom import parse, parseString, getDOMImplementation, Element
+from time import time
+import six
+from six.moves.urllib.parse import unquote
+from six import string_types, text_type as unicode
+from six import unichr
+from bs4 import BeautifulSoup
+## font decoding code lifted from
+## calibre/src/calibre/ebooks/conversion/plugins/epub_input.py
+## copyright '2009, Kovid Goyal <kovid@kovidgoyal.net>'
+## don't bug Kovid about this use of it.
+ADOBE_OBFUSCATION =  'http://ns.adobe.com/pdf/enc#RC'
+IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'
+from itertools import cycle
+class FontDecrypter:
+    def __init__(self, epub, content_dom):
+        self.epub = epub
+        self.content_dom = content_dom
+        self.encryption = {}
+        self.old_uuid = None
+    def get_file(self,href):
+        return self.epub.read(href)
+    def get_encrypted_fontfiles(self):
+        if not self.encryption:
+            ## Find the .opf file.
+            try:
+                # <encryption xmlns="urn:oasis:names:tc:opendocument:xmlns:container"
+                #             xmlns:enc="http://www.w3.org/2001/04/xmlenc#"
+                #             xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">
+                #   <enc:EncryptedData>
+                #     <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>
+                #     <enc:CipherData>
+                #       <enc:CipherReference URI="fonts/00017.ttf"/>
+                #     </enc:CipherData>
+                #   </enc:EncryptedData>
+                # </encryption>
+                encryption = self.epub.read("META-INF/encryption.xml")
+                encryptiondom = parseString(encryption)
+                # print(encryptiondom.toprettyxml(indent='   '))
+                for encdata in encryptiondom.getElementsByTagName('enc:EncryptedData'):
+                    # print(encdata.toprettyxml(indent='   '))
+                    algorithm = encdata.getElementsByTagName('enc:EncryptionMethod')[0].getAttribute('Algorithm')
+                    if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
+                        print("Unknown font encryption: %s"%algorithm)
+                    else:
+                        # print(algorithm)
+                        for encref in encdata.getElementsByTagName('enc:CipherReference'):
+                            # print(encref.getAttribute('URI'))
+                            self.encryption[encref.getAttribute('URI')]=algorithm
+            except KeyError as ke:
+                self.encryption = {}
+        return self.encryption
+    def get_old_uuid(self):
+        if not self.old_uuid:
+            contentdom = self.content_dom
+            uidkey = contentdom.getElementsByTagName("package")[0].getAttribute("unique-identifier")
+            for dcid in contentdom.getElementsByTagName("dc:identifier"):
+                if dcid.getAttribute("id") == uidkey and dcid.getAttribute("opf:scheme") == "uuid":
+                    self.old_uuid = dcid.firstChild.data
+        return self.old_uuid
+    def get_idpf_key(self):
+        # idpf key:urn:uuid:221c69fe-29f3-4cb4-bb3f-58c430261cc6
+        # idpf key:b'\xfb\xa9\x03N}\xae~\x12 \xaa\xe0\xc11\xe2\xe7\x1b\xf6\xa5\xcas'
+        idpf_key = self.get_old_uuid()
+        import uuid, hashlib
+        idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
+        idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
+        return idpf_key
+    def get_adobe_key(self):
+        # adobe key:221c69fe-29f3-4cb4-bb3f-58c430261cc6
+        # adobe key:b'"\x1ci\xfe)\xf3L\xb4\xbb?X\xc40&\x1c\xc6'
+        adobe_key = self.get_old_uuid()
+        import uuid
+        adobe_key = adobe_key.rpartition(':')[-1] # skip urn:uuid:
+        adobe_key = uuid.UUID(adobe_key).bytes
+        return adobe_key
+    def get_decrypted_font_data(self, uri):
+        # print(self.get_old_uuid())
+        # print("idpf : %s"%self.get_idpf_key())
+        # print("adobe: %s"%self.get_adobe_key())
+        # print("uri:%s"%uri)
+        font_data = self.get_file(uri)
+        if uri in self.get_encrypted_fontfiles():
+            key = self.get_adobe_key() if self.get_encrypted_fontfiles()[uri] == ADOBE_OBFUSCATION else self.get_idpf_key()
+            font_data =  self.decrypt_font_data(key, font_data, self.get_encrypted_fontfiles()[uri])
+        return font_data
+    def decrypt_font_data(self, key, data, algorithm):
+        is_adobe = algorithm == ADOBE_OBFUSCATION
+        crypt_len = 1024 if is_adobe else 1040
+        crypt = bytearray(data[:crypt_len])
+        key = cycle(iter(bytearray(key)))
+        decrypt = bytes(bytearray(x^next(key) for x in crypt))
+        return decrypt + data[crypt_len:]
+def _unirepl(match):
+    "Return the unicode string for a decimal number"
+    if match.group(1).startswith('x'):
+        radix=16
+        s = match.group(1)[1:]
+    else:
+        radix=10
+        s = match.group(1)
+    try:
+        value = int(s, radix)
+        retval = "%s%s"%(unichr(value),match.group(2))
+    except:
+        # This way, at least if there's more of entities out there
+        # that fail, it doesn't blow the entire download.
+        print("Numeric entity translation failed, skipping: &#x%s%s"%(match.group(1),match.group(2)))
+        retval = ""
+    return retval
+def _replaceNumberEntities(data):
+    # The same brokenish entity parsing in SGMLParser that inserts ';'
+    # after non-entities will also insert ';' incorrectly after number
+    # entities, including part of the next word if it's a-z.
+    # "Don't&#8212ever&#8212do&#8212that&#8212again," becomes
+    # "Don't&#8212e;ver&#8212d;o&#8212;that&#8212a;gain,"
+    # Also need to allow for 5 digit decimal entities &#27861;
+    # Last expression didn't allow for 2 digit hex correctly: &#xE9;
+    p = re.compile(r'&#(x[0-9a-fA-F]{,4}|[0-9]{,5})([0-9a-fA-F]*?);')
+    return p.sub(_unirepl, data)
+def _replaceNotEntities(data):
+    # not just \w or \S.  regexp from c:\Python25\lib\sgmllib.py
+    # (or equiv), SGMLParser, entityref
+    p = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
+    return p.sub(r'&\1', data)
+def stripHTML(soup):
+    return removeAllEntities(re.sub(r'<[^>]+>','',"%s" % soup)).strip()
+def conditionalRemoveEntities(value):
+    if isinstance(value,string_types) :
+        return removeEntities(value).strip()
+    else:
+        return value
+def removeAllEntities(text):
+    # Remove &lt; &lt; and &amp;
+    return removeEntities(text).replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&')
+def removeEntities(text):
+    if text is None:
+        return ""
+    if not (isinstance(text,string_types)):
+        return str(text)
+    try:
+        t = unicode(text) #.decode('utf-8')
+    except UnicodeEncodeError as e:
+        try:
+            t = text.encode ('ascii', 'xmlcharrefreplace')
+        except UnicodeEncodeError as e:
+            t = text
+    text = t
+    # replace numeric versions of [&<>] with named versions,
+    # then replace named versions with actual characters,
+    text = re.sub(r'&#0*38;','&amp;',text)
+    text = re.sub(r'&#0*60;','&lt;',text)
+    text = re.sub(r'&#0*62;','&gt;',text)
+    # replace remaining &#000; entities with unicode value, such as &#039; -> '
+    text = _replaceNumberEntities(text)
+    # replace several named entities with character, such as &mdash; -> -
+    # see constants.py for the list.
+    # reverse sort will put entities with ; before the same one without, when valid.
+    for e in reversed(sorted(entities.keys())):
+        v = entities[e]
+        try:
+            text = text.replace(e, v)
+        except UnicodeDecodeError as ex:
+            # for the pound symbol in constants.py
+            text = text.replace(e, v.decode('utf-8'))
+    # SGMLParser, and in turn, BeautifulStoneSoup doesn't parse
+    # entities terribly well and inserts (;) after something that
+    # it thinks might be an entity.  AT&T becomes AT&T; All of my
+    # attempts to fix this by changing the input to
+    # BeautifulStoneSoup break something else instead.  But at
+    # this point, there should be *no* real entities left, so find
+    # these not-entities and removing them here should be safe.
+    text = _replaceNotEntities(text)
+    # &lt; &lt; and &amp; are the only html entities allowed in xhtml, put those back.
+    return text.replace('&', '&amp;').replace('&amp;lt', '&lt;').replace('&amp;gt', '&gt;')
+# entity list from http://code.google.com/p/doctype/wiki/CharacterEntitiesConsistent
+entities = { '&aacute;' : 'á',
+         '&Aacute;' : 'Á',
+         '&Aacute' : 'Á',
+         '&aacute' : 'á',
+         '&acirc;' : 'â',
+         '&Acirc;' : 'Â',
+         '&Acirc' : 'Â',
+         '&acirc' : 'â',
+         '&acute;' : '´',
+         '&acute' : '´',
+         '&AElig;' : 'Æ',
+         '&aelig;' : 'æ',
+         '&AElig' : 'Æ',
+         '&aelig' : 'æ',
+         '&agrave;' : 'à',
+         '&Agrave;' : 'À',
+         '&Agrave' : 'À',
+         '&agrave' : 'à',
+         '&alefsym;' : 'ℵ',
+         '&alpha;' : 'α',
+         '&Alpha;' : 'Α',
+         '&amp;' : '&',
+         '&AMP;' : '&',
+         '&AMP' : '&',
+         '&amp' : '&',
+         '&and;' : '∧',
+         '&ang;' : '∠',
+         '&aring;' : 'å',
+         '&Aring;' : 'Å',
+         '&Aring' : 'Å',
+         '&aring' : 'å',
+         '&asymp;' : '≈',
+         '&atilde;' : 'ã',
+         '&Atilde;' : 'Ã',
+         '&Atilde' : 'Ã',
+         '&atilde' : 'ã',
+         '&auml;' : 'ä',
+         '&Auml;' : 'Ä',
+         '&Auml' : 'Ä',
+         '&auml' : 'ä',
+         '&bdquo;' : '„',
+         '&beta;' : 'β',
+         '&Beta;' : 'Β',
+         '&brvbar;' : '¦',
+         '&brvbar' : '¦',
+         '&bull;' : '•',
+         '&cap;' : '∩',
+         '&ccedil;' : 'ç',
+         '&Ccedil;' : 'Ç',
+         '&Ccedil' : 'Ç',
+         '&ccedil' : 'ç',
+         '&cedil;' : '¸',
+         '&cedil' : '¸',
+         '&cent;' : '¢',
+         '&cent' : '¢',
+         '&chi;' : 'χ',
+         '&Chi;' : 'Χ',
+         '&circ;' : 'ˆ',
+         '&clubs;' : '♣',
+         '&cong;' : '≅',
+         '&copy;' : '©',
+         '&COPY;' : '©',
+         '&COPY' : '©',
+         '&copy' : '©',
+         '&crarr;' : '↵',
+         '&cup;' : '∪',
+         '&curren;' : '¤',
+         '&curren' : '¤',
+         '&dagger;' : '†',
+         '&Dagger;' : '‡',
+         '&darr;' : '↓',
+         '&dArr;' : '⇓',
+         '&deg;' : '°',
+         '&deg' : '°',
+         '&delta;' : 'δ',
+         '&Delta;' : 'Δ',
+         '&diams;' : '♦',
+         '&divide;' : '÷',
+         '&divide' : '÷',
+         '&eacute;' : 'é',
+         '&Eacute;' : 'É',
+         '&Eacute' : 'É',
+         '&eacute' : 'é',
+         '&ecirc;' : 'ê',
+         '&Ecirc;' : 'Ê',
+         '&Ecirc' : 'Ê',
+         '&ecirc' : 'ê',
+         '&egrave;' : 'è',
+         '&Egrave;' : 'È',
+         '&Egrave' : 'È',
+         '&egrave' : 'è',
+         '&empty;' : '∅',
+         '&emsp;' : ' ',
+         '&ensp;' : ' ',
+         '&epsilon;' : 'ε',
+         '&Epsilon;' : 'Ε',
+         '&equiv;' : '≡',
+         '&eta;' : 'η',
+         '&Eta;' : 'Η',
+         '&eth;' : 'ð',
+         '&ETH;' : 'Ð',
+         '&ETH' : 'Ð',
+         '&eth' : 'ð',
+         '&euml;' : 'ë',
+         '&Euml;' : 'Ë',
+         '&Euml' : 'Ë',
+         '&euml' : 'ë',
+         '&euro;' : '€',
+         '&exist;' : '∃',
+         '&fnof;' : 'ƒ',
+         '&forall;' : '∀',
+         '&frac12;' : '½',
+         '&frac12' : '½',
+         '&frac14;' : '¼',
+         '&frac14' : '¼',
+         '&frac34;' : '¾',
+         '&frac34' : '¾',
+         '&frasl;' : '⁄',
+         '&gamma;' : 'γ',
+         '&Gamma;' : 'Γ',
+         '&ge;' : '≥',
+         #'&gt;' : '>',
+         #'&GT;' : '>',
+         #'&GT' : '>',
+         #'&gt' : '>',
+         '&harr;' : '↔',
+         '&hArr;' : '⇔',
+         '&hearts;' : '♥',
+         '&hellip;' : '…',
+         '&iacute;' : 'í',
+         '&Iacute;' : 'Í',
+         '&Iacute' : 'Í',
+         '&iacute' : 'í',
+         '&icirc;' : 'î',
+         '&Icirc;' : 'Î',
+         '&Icirc' : 'Î',
+         '&icirc' : 'î',
+         '&iexcl;' : '¡',
+         '&iexcl' : '¡',
+         '&igrave;' : 'ì',
+         '&Igrave;' : 'Ì',
+         '&Igrave' : 'Ì',
+         '&igrave' : 'ì',
+         '&image;' : 'ℑ',
+         '&infin;' : '∞',
+         '&int;' : '∫',
+         '&iota;' : 'ι',
+         '&Iota;' : 'Ι',
+         '&iquest;' : '¿',
+         '&iquest' : '¿',
+         '&isin;' : '∈',
+         '&iuml;' : 'ï',
+         '&Iuml;' : 'Ï',
+         '&Iuml' : 'Ï',
+         '&iuml' : 'ï',
+         '&kappa;' : 'κ',
+         '&Kappa;' : 'Κ',
+         '&lambda;' : 'λ',
+         '&Lambda;' : 'Λ',
+         '&laquo;' : '«',
+         '&laquo' : '«',
+         '&larr;' : '←',
+         '&lArr;' : '⇐',
+         '&lceil;' : '⌈',
+         '&ldquo;' : '“',
+         '&le;' : '≤',
+         '&lfloor;' : '⌊',
+         '&lowast;' : '∗',
+         '&loz;' : '◊',
+         '&lrm;' : '‎',
+         '&lsaquo;' : '‹',
+         '&lsquo;' : '‘',
+         #'&lt;' : '<',
+         #'&LT;' : '<',
+         #'&LT' : '<',
+         #'&lt' : '<',
+         '&macr;' : '¯',
+         '&macr' : '¯',
+         '&mdash;' : '—',
+         '&micro;' : 'µ',
+         '&micro' : 'µ',
+         '&middot;' : '·',
+         '&middot' : '·',
+         '&minus;' : '−',
+         '&mu;' : 'μ',
+         '&Mu;' : 'Μ',
+         '&nabla;' : '∇',
+         '&nbsp;' : ' ',
+         '&nbsp' : ' ',
+         '&ndash;' : '–',
+         '&ne;' : '≠',
+         '&ni;' : '∋',
+         '&not;' : '¬',
+         '&not' : '¬',
+         '&notin;' : '∉',
+         '&nsub;' : '⊄',
+         '&ntilde;' : 'ñ',
+         '&Ntilde;' : 'Ñ',
+         '&Ntilde' : 'Ñ',
+         '&ntilde' : 'ñ',
+         '&nu;' : 'ν',
+         '&Nu;' : 'Ν',
+         '&oacute;' : 'ó',
+         '&Oacute;' : 'Ó',
+         '&Oacute' : 'Ó',
+         '&oacute' : 'ó',
+         '&ocirc;' : 'ô',
+         '&Ocirc;' : 'Ô',
+         '&Ocirc' : 'Ô',
+         '&ocirc' : 'ô',
+         '&OElig;' : 'Œ',
+         '&oelig;' : 'œ',
+         '&ograve;' : 'ò',
+         '&Ograve;' : 'Ò',
+         '&Ograve' : 'Ò',
+         '&ograve' : 'ò',
+         '&oline;' : '‾',
+         '&omega;' : 'ω',
+         '&Omega;' : 'Ω',
+         '&omicron;' : 'ο',
+         '&Omicron;' : 'Ο',
+         '&oplus;' : '⊕',
+         '&or;' : '∨',
+         '&ordf;' : 'ª',
+         '&ordf' : 'ª',
+         '&ordm;' : 'º',
+         '&ordm' : 'º',
+         '&oslash;' : 'ø',
+         '&Oslash;' : 'Ø',
+         '&Oslash' : 'Ø',
+         '&oslash' : 'ø',
+         '&otilde;' : 'õ',
+         '&Otilde;' : 'Õ',
+         '&Otilde' : 'Õ',
+         '&otilde' : 'õ',
+         '&otimes;' : '⊗',
+         '&ouml;' : 'ö',
+         '&Ouml;' : 'Ö',
+         '&Ouml' : 'Ö',
+         '&ouml' : 'ö',
+         '&para;' : '¶',
+         '&para' : '¶',
+         '&part;' : '∂',
+         '&permil;' : '‰',
+         '&perp;' : '⊥',
+         '&phi;' : 'φ',
+         '&Phi;' : 'Φ',
+         '&pi;' : 'π',
+         '&Pi;' : 'Π',
+         '&piv;' : 'ϖ',
+         '&plusmn;' : '±',
+         '&plusmn' : '±',
+         '&pound;' : '£',
+         '&pound' : '£',
+         '&prime;' : '′',
+         '&Prime;' : '″',
+         '&prod;' : '∏',
+         '&prop;' : '∝',
+         '&psi;' : 'ψ',
+         '&Psi;' : 'Ψ',
+         '&quot;' : '"',
+         '&QUOT;' : '"',
+         '&QUOT' : '"',
+         '&quot' : '"',
+         '&radic;' : '√',
+         '&raquo;' : '»',
+         '&raquo' : '»',
+         '&rarr;' : '→',
+         '&rArr;' : '⇒',
+         '&rceil;' : '⌉',
+         '&rdquo;' : '”',
+         '&real;' : 'ℜ',
+         '&reg;' : '®',
+         '&REG;' : '®',
+         '&REG' : '®',
+         '&reg' : '®',
+         '&rfloor;' : '⌋',
+         '&rho;' : 'ρ',
+         '&Rho;' : 'Ρ',
+         '&rlm;' : '‏',
+         '&rsaquo;' : '›',
+         '&rsquo;' : '’',
+         '&sbquo;' : '‚',
+         '&scaron;' : 'š',
+         '&Scaron;' : 'Š',
+         '&sdot;' : '⋅',
+         '&sect;' : '§',
+         '&sect' : '§',
+         '&shy;' : '', # strange optional hyphenation control character, not just a dash
+         '&shy' : '',
+         '&sigma;' : 'σ',
+         '&Sigma;' : 'Σ',
+         '&sigmaf;' : 'ς',
+         '&sim;' : '∼',
+         '&spades;' : '♠',
+         '&sub;' : '⊂',
+         '&sube;' : '⊆',
+         '&sum;' : '∑',
+         '&sup1;' : '¹',
+         '&sup1' : '¹',
+         '&sup2;' : '²',
+         '&sup2' : '²',
+         '&sup3;' : '³',
+         '&sup3' : '³',
+         '&sup;' : '⊃',
+         '&supe;' : '⊇',
+         '&szlig;' : 'ß',
+         '&szlig' : 'ß',
+         '&tau;' : 'τ',
+         '&Tau;' : 'Τ',
+         '&there4;' : '∴',
+         '&theta;' : 'θ',
+         '&Theta;' : 'Θ',
+         '&thetasym;' : 'ϑ',
+         '&thinsp;' : ' ',
+         '&thorn;' : 'þ',
+         '&THORN;' : 'Þ',
+         '&THORN' : 'Þ',
+         '&thorn' : 'þ',
+         '&tilde;' : '˜',
+         '&times;' : '×',
+         '&times' : '×',
+         '&trade;' : '™',
+         '&uacute;' : 'ú',
+         '&Uacute;' : 'Ú',
+         '&Uacute' : 'Ú',
+         '&uacute' : 'ú',
+         '&uarr;' : '↑',
+         '&uArr;' : '⇑',
+         '&ucirc;' : 'û',
+         '&Ucirc;' : 'Û',
+         '&Ucirc' : 'Û',
+         '&ucirc' : 'û',
+         '&ugrave;' : 'ù',
+         '&Ugrave;' : 'Ù',
+         '&Ugrave' : 'Ù',
+         '&ugrave' : 'ù',
+         '&uml;' : '¨',
+         '&uml' : '¨',
+         '&upsih;' : 'ϒ',
+         '&upsilon;' : 'υ',
+         '&Upsilon;' : 'Υ',
+         '&uuml;' : 'ü',
+         '&Uuml;' : 'Ü',
+         '&Uuml' : 'Ü',
+         '&uuml' : 'ü',
+         '&weierp;' : '℘',
+         '&xi;' : 'ξ',
+         '&Xi;' : 'Ξ',
+         '&yacute;' : 'ý',
+         '&Yacute;' : 'Ý',
+         '&Yacute' : 'Ý',
+         '&yacute' : 'ý',
+         '&yen;' : '¥',
+         '&yen' : '¥',
+         '&yuml;' : 'ÿ',
+         '&Yuml;' : 'Ÿ',
+         '&yuml' : 'ÿ',
+         '&zeta;' : 'ζ',
+         '&Zeta;' : 'Ζ',
+         '&zwj;' : '‍',  # strange spacing control character, not just a space
+         '&zwnj;' : '‌',  # strange spacing control character, not just a space
+         }
+class SplitEpub:
+    def __init__(self, inputio):
+        self.epub = ZipFile(inputio, 'r')
+        self.content_dom = None
+        self.content_relpath = None
+        self.manifest_items = None
+        self.guide_items = None
+        self.toc_dom = None
+        self.toc_relpath = None
+        self.toc_map = None
+        self.split_lines = None
+        self.origauthors = []
+        self.origtitle = None
+    def get_file(self,href):
+        return self.epub.read(href)
+    def get_content_dom(self):
+        if not self.content_dom:
+            ## Find the .opf file.
+            container = self.epub.read("META-INF/container.xml")
+            containerdom = parseString(container)
+            rootfilenodelist = containerdom.getElementsByTagName("rootfile")
+            rootfilename = rootfilenodelist[0].getAttribute("full-path")
+            self.content_dom = parseString(self.epub.read(rootfilename))
+            self.content_relpath = get_path_part(rootfilename)
+        return self.content_dom
+    def get_content_relpath(self):
+        ## Save the path to the .opf file--hrefs inside it are relative to it.
+        if not self.content_relpath:
+            self.get_content_dom() # sets self.content_relpath also.
+        return self.content_relpath
+    def get_toc_relpath(self):
+        ## Save the path to the toc.ncx file--hrefs inside it are relative to it.
+        if not self.toc_relpath:
+            self.get_manifest_items() # sets self.toc_relpath also.
+        return self.toc_relpath
+    def get_manifest_items(self):
+        if not self.manifest_items:
+            self.manifest_items = {}
+            for item in self.get_content_dom().getElementsByTagName("item"):
+                fullhref=normpath(unquote(self.get_content_relpath()+item.getAttribute("href")))
+                #print("---- item fullhref:%s"%(fullhref))
+                self.manifest_items["h:"+fullhref]=(item.getAttribute("id"),item.getAttribute("media-type"))
+                self.manifest_items["i:"+item.getAttribute("id")]=(fullhref,item.getAttribute("media-type"))
+                if( item.getAttribute("media-type") == "application/x-dtbncx+xml" ):
+                    # TOC file is only one with this type--as far as I know.
+                    self.toc_relpath = get_path_part(fullhref)
+                    self.toc_dom = parseString(self.epub.read(fullhref))
+        return self.manifest_items
+    def get_guide_items(self):
+        if not self.guide_items:
+            self.guide_items = {}
+            for item in self.get_content_dom().getElementsByTagName("reference"):
+                fullhref=normpath(unquote(self.get_content_relpath()+item.getAttribute("href")))
+                self.guide_items[fullhref]=(item.getAttribute("type"),item.getAttribute("title"))
+                #print("---- reference href:%s value:%s"%(fullhref,self.guide_items[fullhref],))
+                #self.guide_items[item.getAttribute("type")]=(fullhref,item.getAttribute("media-type"))
+        return self.guide_items
+    def get_toc_dom(self):
+        if not self.toc_dom:
+            self.get_manifest_items() # also sets self.toc_dom
+        return self.toc_dom
+    # dict() of href->[(text,anchor),...],...
+    # eg: "file0001.html"->[("Introduction","anchor01"),("Chapter 1","anchor02")],...
+    def get_toc_map(self):
+        if not self.toc_map:
+            self.toc_map = {}
+            # update all navpoint ids with bookid for uniqueness.
+            for navpoint in self.get_toc_dom().getElementsByTagName("navPoint"):
+                src = normpath(unquote(self.get_toc_relpath()+navpoint.getElementsByTagName("content")[0].getAttribute("src")))
+                if '#' in src:
+                    (href,anchor)=src.split("#")
+                else:
+                    (href,anchor)=(src,None)
+                # The first of these in each navPoint should be the appropriate one.
+                # (may be others due to nesting.
+                try:
+                    text = unicode(navpoint.getElementsByTagName("text")[0].firstChild.data)
+                except:
+                    #print("No chapter title found in TOC for (%s)"%src)
+                    text = ""
+                if href not in self.toc_map:
+                    self.toc_map[href] = []
+                if anchor == None:
+                    # put file links ahead of ancher links.  Otherwise
+                    # a non-linear anchor link may take precedence,
+                    # which will confuse EpubSplit.  This will cause
+                    # split lines to possibly be out of order from
+                    # TOC, but the alternative is worse.  Should be a
+                    # rare corner case.
+                    ## Keep order of non-anchor entries to the same file.
+                    idx=0
+                    while idx < len(self.toc_map[href]) and self.toc_map[href][idx][1] is None: # [1] is anchor
+                        # print(idx)
+                        # print(self.toc_map[href][idx])
+                        idx = idx+1
+                    self.toc_map[href].insert(idx,(text,anchor))
+                else:
+                    self.toc_map[href].append((text,anchor))
+            # print(self.toc_map)
+        return self.toc_map
+    # list of dicts with href, anchor & toc text.
+    # 'split lines' are all the points that the epub can be split on.
+    # Offer a split at each spine file and each ToC point.
+    def get_split_lines(self):
+        metadom = self.get_content_dom()
+        ## Save indiv book title
+        try:
+            self.origtitle = metadom.getElementsByTagName("dc:title")[0].firstChild.data
+        except:
+            self.origtitle = "(Title Missing)"
+        ## Save authors.
+        for creator in metadom.getElementsByTagName("dc:creator"):
+            try:
+                if( creator.getAttribute("opf:role") == "aut" or not creator.hasAttribute("opf:role") and creator.firstChild != None):
+                    if creator.firstChild.data not in self.origauthors:
+                        self.origauthors.append(creator.firstChild.data)
+            except:
+                pass
+        if len(self.origauthors) == 0:
+            self.origauthors.append("(Authors Missing)")
+        self.split_lines = [] # list of dicts with href, anchor and toc
+        # spin on spine files.
+        count=0
+        for itemref in metadom.getElementsByTagName("itemref"):
+            idref = itemref.getAttribute("idref")
+            (href,type) = self.get_manifest_items()["i:"+idref]
+            current = {}
+            self.split_lines.append(current)
+            current['href']=href
+            current['anchor']=None
+            current['toc'] = []
+            if href in self.get_guide_items():
+                current['guide'] = self.get_guide_items()[href]
+            current['id'] = idref
+            current['type'] = type
+            current['num'] = count
+            t=self.epub.read(href).decode('utf-8')
+            if len(t) > 1500 : t = t[:1500] + "..."
+            current['sample']=t
+            count += 1
+            #print("spine:%s->%s"%(idref,href))
+            # if href is in the toc.
+            if href in self.get_toc_map():
+                # For each toc entry, check to see if there's an anchor, if so,
+                # make a new split line.
+                for tocitem in self.get_toc_map()[href]:
+                    (text,anchor) = tocitem
+                    # XXX for outputing to screen in CLI--hopefully won't need in plugin?
+                    try:
+                        text = "%s"%text
+                    except:
+                        text = "(error text)"
+                    if anchor:
+                        #print("breakpoint: %d"%count)
+                        current = {}
+                        self.split_lines.append(current)
+                        current['href']=href
+                        current['anchor']=anchor
+                        current['toc']=[]
+                        current['id'] = idref
+                        current['type'] = type
+                        current['num'] = count
+                        # anchor, need to split first, then reduce to 1500.
+                        t=splitHtml(self.epub.read(href).decode('utf-8'),anchor,before=False)
+                        if len(t) > 1500 : t = t[:1500] + "..."
+                        current['sample']=t
+                        count += 1
+                    # There can be more than one toc to the same split line.
+                    # This won't find multiple toc to the same anchor yet.
+                    current['toc'].append(text)
+                    #print("\ttoc:'%s' %s#%s"%(text,href,anchor))
+        return self.split_lines
+    # pass in list of line numbers(?)
+    def get_split_files(self,linenums):
+        self.filecache = FileCache(self.get_manifest_items())
+        # set include flag in split_lines.
+        if not self.split_lines:
+            self.get_split_lines()
+        lines = self.split_lines
+        lines_set = set([int(k) for k in linenums])
+        for j in range(len(lines)):
+            lines[j]['include'] = j in lines_set
+        # loop through finding 'chunks' -- contiguous pieces in the
+        # same file.  Each included file is at least one chunk, but if
+        # parts are left out, one original file can end up being more
+        # than one chunk.
+        outchunks = [] # list of tuples=(filename,start,end) 'end' is not inclusive.
+        inchunk = False
+        currentfile = None
+        start = None
+        for line in lines:
+            if line['include']:
+                if not inchunk: # start new chunk
+                    inchunk = True
+                    currentfile = line['href']
+                    start = line
+                else: # inchunk
+                    # different file, new chunk.
+                    if currentfile != line['href']:
+                        outchunks.append((currentfile,start,line))
+                        inchunk=True
+                        currentfile=line['href']
+                        start=line
+            else: # not include
+                if inchunk: # save previous chunk.
+                    outchunks.append((currentfile,start,line))
+                    inchunk=False
+        # final chunk for when last in list is include.
+        if inchunk:
+            outchunks.append((currentfile,start,None))
+        outfiles=[]  # tuples, (filename,type,data) -- filename changed to unique
+        for (href,start,end) in outchunks:
+            filedata = self.epub.read(href).decode('utf-8')
+            # discard before start if anchor.
+            if start['anchor'] != None:
+                filedata = splitHtml(filedata,start['anchor'],before=False)
+            # discard from end anchor on(inclusive), but only if same file.  If
+            # different file, keep rest of file.  If no 'end', then it was the
+            # last chunk and went to the end of the last file.
+            if end != None and end['anchor'] != None and end['href']==href:
+                filedata = splitHtml(filedata,end['anchor'],before=True)
+            filename = self.filecache.add_content_file(href,filedata)
+            outfiles.append([filename,start['id'],start['type'],filedata])
+        # print("self.oldnew:%s"%self.filecache.oldnew)
+        # print("self.newold:%s"%self.filecache.newold)
+        # print("\nanchors:%s\n"%self.filecache.anchors)
+        # print("\nlinkedfiles:%s\n"%self.filecache.linkedfiles)
+        # print("relpath:%s"%get_path_part())
+        # Spin through to replace internal URLs
+        for fl in outfiles:
+            #print("file:%s"%fl[0])
+            soup = BeautifulSoup(fl[3],'html5lib')
+            changed = False
+            for a in soup.findAll('a'):
+                if a.has_attr('href'):
+                    path = normpath(unquote("%s%s"%(get_path_part(fl[0]),a['href'])))
+                    #print("full a['href']:%s"%path)
+                    if path in self.filecache.anchors and self.filecache.anchors[path] != path:
+                        a['href'] = self.filecache.anchors[path][len(get_path_part(fl[0])):]
+                        #print("replacement path:%s"%a['href'])
+                        changed = True
+            if changed:
+                fl[3] = unicode(soup)
+        return outfiles
+    def write_split_epub(self,
+                         outputio,
+                         linenums,
+                         changedtocs={},
+                         authoropts=[],
+                         titleopt=None,
+                         descopt=None,
+                         tags=[],
+                         languages=['en'],
+                         coverjpgpath=None):
+        files = self.get_split_files(linenums)
+        ## Write mimetype file, must be first and uncompressed.
+        ## Older versions of python(2.4/5) don't allow you to specify
+        ## compression by individual file.
+        ## Overwrite if existing output file.
+        outputepub = ZipFile(outputio, "w", compression=ZIP_STORED)
+        outputepub.debug = 3
+        outputepub.writestr("mimetype", "application/epub+zip")
+        outputepub.close()
+        ## Re-open file for content.
+        outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED)
+        outputepub.debug = 3
+        ## Create META-INF/container.xml file.  The only thing it does is
+        ## point to content.opf
+        containerdom = getDOMImplementation().createDocument(None, "container", None)
+        containertop = containerdom.documentElement
+        containertop.setAttribute("version","1.0")
+        containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
+        rootfiles = containerdom.createElement("rootfiles")
+        containertop.appendChild(rootfiles)
+        rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf",
+                                                              "media-type":"application/oebps-package+xml"}))
+        outputepub.writestr("META-INF/container.xml",containerdom.toprettyxml(indent='   ',encoding='utf-8'))
+####    ## create content.opf file.
+        uniqueid="epubsplit-uid-%d" % time() # real sophisticated uid scheme.
+        contentdom = getDOMImplementation().createDocument(None, "package", None)
+        package = contentdom.documentElement
+        package.setAttribute("version","2.0")
+        package.setAttribute("xmlns","http://www.idpf.org/2007/opf")
+        package.setAttribute("unique-identifier","epubsplit-id")
+        metadata=newTag(contentdom,"metadata",
+                        attrs={"xmlns:dc":"http://purl.org/dc/elements/1.1/",
+                               "xmlns:opf":"http://www.idpf.org/2007/opf"})
+        metadata.appendChild(newTag(contentdom,"dc:identifier",text=uniqueid,attrs={"id":"epubsplit-id"}))
+        if( titleopt is None ):
+            titleopt = self.origtitle+" Split"
+        metadata.appendChild(newTag(contentdom,"dc:title",text=titleopt))
+        if( authoropts and len(authoropts) > 0  ):
+            useauthors=authoropts
+        else:
+            useauthors=self.origauthors
+        usedauthors=dict()
+        for author in useauthors:
+            if( author not in usedauthors ):
+                usedauthors[author]=author
+                metadata.appendChild(newTag(contentdom,"dc:creator",
+                                            attrs={"opf:role":"aut"},
+                                            text=author))
+        metadata.appendChild(newTag(contentdom,"dc:contributor",text="epubsplit",attrs={"opf:role":"bkp"}))
+        metadata.appendChild(newTag(contentdom,"dc:rights",text="Copyrights as per source stories"))
+        if languages:
+            for l in languages:
+                metadata.appendChild(newTag(contentdom,"dc:language",text=l))
+        else:
+            metadata.appendChild(newTag(contentdom,"dc:language",text="en"))
+        if not descopt:
+            # created now, but not filled in until TOC generation to save loops.
+            description = newTag(contentdom,"dc:description",text="Split from %s by %s."%(self.origtitle,", ".join(self.origauthors)))
+        else:
+            description = newTag(contentdom,"dc:description",text=descopt)
+        metadata.appendChild(description)
+        for tag in tags:
+            metadata.appendChild(newTag(contentdom,"dc:subject",text=tag))
+        package.appendChild(metadata)
+        manifest = contentdom.createElement("manifest")
+        package.appendChild(manifest)
+        spine = newTag(contentdom,"spine",attrs={"toc":"ncx"})
+        package.appendChild(spine)
+        manifest.appendChild(newTag(contentdom,"item",
+                                    attrs={'id':'ncx',
+                                           'href':'toc.ncx',
+                                           'media-type':'application/x-dtbncx+xml'}))
+        if coverjpgpath:
+            # <meta name="cover" content="cover.jpg"/>
+            metadata.appendChild(newTag(contentdom,"meta",{"name":"cover",
+                                                           "content":"coverimageid"}))
+            # cover stuff for later:
+            # at end of <package>:
+            # <guide>
+            # <reference type="cover" title="Cover" href="Text/cover.xhtml"/>
+            # </guide>
+            guide = newTag(contentdom,"guide")
+            guide.appendChild(newTag(contentdom,"reference",attrs={"type":"cover",
+                                                       "title":"Cover",
+                                                       "href":"cover.xhtml"}))
+            package.appendChild(guide)
+            manifest.appendChild(newTag(contentdom,"item",
+                                        attrs={'id':"coverimageid",
+                                               'href':"cover.jpg",
+                                               'media-type':"image/jpeg"}))
+            # Note that the id of the cover xhmtl *must* be 'cover'
+            # for it to work on Nook.
+            manifest.appendChild(newTag(contentdom,"item",
+                                        attrs={'id':"cover",
+                                               'href':"cover.xhtml",
+                                               'media-type':"application/xhtml+xml"}))
+            spine.appendChild(newTag(contentdom,"itemref",
+                                     attrs={"idref":"cover",
+                                            "linear":"yes"}))
+        contentcount=0
+        for (filename,id,type,filedata) in files:
+            #filename = self.filecache.addHtml(href,filedata)
+            #print("writing :%s"%filename)
+            # add to manifest and spine
+            if coverjpgpath and filename == "cover.xhtml":
+                continue # don't dup cover.
+            outputepub.writestr(filename,filedata.encode('utf-8'))
+            id = "a%d"%contentcount
+            contentcount += 1
+            manifest.appendChild(newTag(contentdom,"item",
+                                        attrs={'id':id,
+                                               'href':filename,
+                                               'media-type':type}))
+            spine.appendChild(newTag(contentdom,"itemref",
+                                     attrs={"idref":id,
+                                            "linear":"yes"}))
+        fontdecrypter = FontDecrypter(self.epub,self.get_content_dom())
+        linked=''
+        for (linked,type) in self.filecache.linkedfiles:
+            # print("linked files:(%s,%s)"%(linked,type))
+            # add to manifest
+            if coverjpgpath and linked == "cover.jpg":
+                continue # don't dup cover.
+            try:
+                linkeddata = self.get_file(linked)
+                if linked in fontdecrypter.get_encrypted_fontfiles():
+                    print("Decrypting font file: %s"%linked)
+                    linkeddata = fontdecrypter.get_decrypted_font_data(linked)
+                outputepub.writestr(linked,linkeddata)
+            except Exception as e:
+                print("Skipping linked file (%s)\nException: %s"%(linked,e))
+            id = "a%d"%contentcount
+            contentcount += 1
+            manifest.appendChild(newTag(contentdom,"item",
+                                        attrs={'id':id,
+                                               'href':linked,
+                                               'media-type':type}))
+        contentxml = contentdom.toprettyxml(indent='   ') # ,encoding='utf-8'
+        # tweak for brain damaged Nook STR.  Nook insists on name before content.
+        contentxml = contentxml.replace('<meta content="coverimageid" name="cover"/>',
+                                        '<meta name="cover" content="coverimageid"/>')
+        outputepub.writestr("content.opf",contentxml)
+        ## create toc.ncx file
+        tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
+        ncx = tocncxdom.documentElement
+        ncx.setAttribute("version","2005-1")
+        ncx.setAttribute("xmlns","http://www.daisy.org/z3986/2005/ncx/")
+        head = tocncxdom.createElement("head")
+        ncx.appendChild(head)
+        head.appendChild(newTag(tocncxdom,"meta",
+                                attrs={"name":"dtb:uid", "content":uniqueid}))
+        depthnode = newTag(tocncxdom,"meta",
+                                attrs={"name":"dtb:depth", "content":"1"})
+        head.appendChild(depthnode)
+        head.appendChild(newTag(tocncxdom,"meta",
+                                attrs={"name":"dtb:totalPageCount", "content":"0"}))
+        head.appendChild(newTag(tocncxdom,"meta",
+                                attrs={"name":"dtb:maxPageNumber", "content":"0"}))
+        docTitle = tocncxdom.createElement("docTitle")
+        docTitle.appendChild(newTag(tocncxdom,"text",text=stripHTML(titleopt)))
+        ncx.appendChild(docTitle)
+        tocnavMap = tocncxdom.createElement("navMap")
+        ncx.appendChild(tocnavMap)
+        # come back to lines again for TOC because files only has files(gasp-shock!)
+        count=1
+        for line in self.split_lines:
+            if line['include']:
+                # if changed, use only changed values.
+                if line['num'] in changedtocs:
+                    line['toc'] = changedtocs[line['num']]
+                # can have more than one toc entry.
+                for title in line['toc']:
+                    newnav = newTag(tocncxdom,"navPoint",
+                                    {"id":"a%03d"%count,"playOrder":"%d" % count})
+                    count += 1
+                    tocnavMap.appendChild(newnav)
+                    navlabel = newTag(tocncxdom,"navLabel")
+                    newnav.appendChild(navlabel)
+                    # For purposes of TOC titling & desc, use first book author
+                    navlabel.appendChild(newTag(tocncxdom,"text",text=stripHTML(title)))
+                    # Find the first 'spine' item's content for the title navpoint.
+                    # Many epubs have the first chapter as first navpoint, so we can't just
+                    # copy that anymore.
+                    if line['anchor'] and line['href']+"#"+line['anchor'] in self.filecache.anchors:
+                        src = self.filecache.anchors[line['href']+"#"+line['anchor']]
+                        #print("toc from anchors(%s#%s)(%s)"%(line['href'],line['anchor'],src))
+                    else:
+                        #print("toc from href(%s)"%line['href'])
+                        src = line['href']
+                    newnav.appendChild(newTag(tocncxdom,"content",
+                                              {"src":src}))
+        outputepub.writestr("toc.ncx",tocncxdom.toprettyxml(indent='   ',encoding='utf-8'))
+        if coverjpgpath:
+            # write, not write string.  Pulling from file.
+            outputepub.write(coverjpgpath,"cover.jpg")
+            outputepub.writestr("cover.xhtml",'''
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"><head><title>Cover</title><style type="text/css" title="override_css">
+@page {padding: 0pt; margin:0pt}
+body { text-align: center; padding:0pt; margin: 0pt; }
+div { margin: 0pt; padding: 0pt; }
+</style></head><body><div>
+<img src="cover.jpg" alt="cover"/>
+</div></body></html>
+''')
+	# declares all the files created by Windows.  otherwise, when
+        # it runs in appengine, windows unzips the files as 000 perms.
+        for zf in outputepub.filelist:
+            zf.create_system = 0
+        outputepub.close()
+class FileCache:
+    def __init__(self,manifest_items={}):
+        self.manifest_items = manifest_items
+        self.oldnew = {}
+        self.newold = {}
+        self.anchors = {}
+        self.linkedfiles = set()
+        ## always include font files for embedded fonts
+        for key, value in six.iteritems(self.manifest_items):
+            # print("manifest:%s %s"%(key,value))
+            if key.startswith('i:') and value[1] in ('application/vnd.ms-opentype',
+                                                     'application/x-font-ttf',
+                                                     'application/x-font-truetype',
+                                                     'application/font-sfnt'):
+                self.add_linked_file(value[0])
+    def add_linked_file(self, href):
+        href = normpath(unquote(href)) # fix %20 & /../
+        if ("h:"+href) in self.manifest_items:
+            type = self.manifest_items["h:"+href][1]
+        else:
+            type = 'unknown'
+        self.linkedfiles.add((href,type))
+    def add_content_file(self, href, filedata):
+        changedname = False
+        if href not in self.oldnew:
+            self.oldnew[href]=[]
+            newfile = href
+        else:
+            changedname = True
+            newfile = "%s%d-%s"%(get_path_part(href),
+                                 len(self.oldnew[href]),
+                                 get_file_part(href))
+        self.oldnew[href].append(newfile)
+        self.newold[newfile]=href
+        #print("newfile:%s"%newfile)
+        soup = BeautifulSoup(filedata,'html5lib')
+        #print("soup head:%s"%soup.find('head'))
+        # same name?  Don't need to worry about changing links to anchors
+        for a in soup.findAll(): # not just 'a', any tag.
+            #print("a:%s"%a)
+            if a.has_attr('id'):
+                self.anchors[href+'#'+a['id']]=newfile+'#'+a['id']
+        # <image> from baen epub.
+        # <image width="462" height="616" xlink:href="cover.jpeg"/>
+        for img in soup.findAll('img') + soup.findAll('image'):
+            src = None
+            if img.has_attr('src'):
+                src=img['src']
+            if img.has_attr('xlink:href'):
+                src=img['xlink:href']
+            if src:
+                self.add_linked_file(get_path_part(href)+src)
+            else:
+                logger.info("img tag without src in file:(%s) tag:(%s)"%(href,img))
+        # link href="0.css" type="text/css"
+        for style in soup.findAll('link',{'type':'text/css'}):
+            #print("link:%s"%style)
+            if style.has_attr('href'):
+                self.add_linked_file(get_path_part(href)+style['href'])
+        return newfile
+def splitHtml(data,tagid,before=False):
+    soup = BeautifulSoup(data,'lxml')
+    #print("splitHtml.soup head:%s"%soup.find('head'))
+    splitpoint = soup.find(id=tagid)
+    #print("splitpoint:%s"%splitpoint)
+    if splitpoint == None:
+        return data
+    if before:
+        # remove all next siblings.
+        for n in splitpoint.findNextSiblings():
+            n.extract()
+        parent = splitpoint.parent
+        while parent and parent.name != 'body':
+            for n in parent.findNextSiblings():
+                n.extract()
+            parent = parent.parent
+        splitpoint.extract()
+    else:
+        # remove all prev siblings.
+        for n in splitpoint.findPreviousSiblings():
+            n.extract()
+        parent = splitpoint.parent
+        while parent and parent.name != 'body':
+            for n in parent.findPreviousSiblings():
+                n.extract()
+            parent = parent.parent
+    return re.sub(r'( *\r?\n)+','\r\n',unicode(soup))
+def get_path_part(n):
+    relpath = os.path.dirname(n)
+    if( len(relpath) > 0 ):
+        relpath=relpath+"/"
+    return relpath
+def get_file_part(n):
+    return os.path.basename(n)
+## Utility method for creating new tags.
+def newTag(dom,name,attrs=None,text=None):
+    tag = dom.createElement(name)
+    if( attrs is not None ):
+        for attr in attrs.keys():
+            tag.setAttribute(attr,attrs[attr])
+    if( text is not None ):
+        tag.appendChild(dom.createTextNode(text))
+    return tag
+def main(argv,usage=None):
+    from optparse import OptionParser
+    if not usage:
+        # read in args, anything starting with -- will be treated as --<varible>=<value>
+        usage = 'usage: python %prog'
+    parser = OptionParser(usage+''' [options] <input epub> [line numbers...]
+Giving an epub without line numbers will return a list of line numbers: the
+possible split points in the input file. Calling with line numbers will
+generate an epub with each of the "lines" given included.''')
+    parser.add_option("-o", "--output", dest="outputopt", default="split.epub",
+                      help="Set OUTPUT file, Default: split.epub", metavar="OUTPUT")
+    parser.add_option("--output-dir", dest="outputdiropt",
+                      help="Set OUTPUT directory, Default: presend working directory")
+    parser.add_option('--split-by-section',
+                      action='store_true', dest='split_by_section',
+                      help='Create a new epub from each of the listed line sections instead of one containing all.  Splits all sections if no lines numbers are given. Each split will be named <number>-<output name> and placed in the output-dir.  Sections without a Table of Contents entry will be included with the preceding section(s)', )
+    parser.add_option("-t", "--title", dest="titleopt", default=None,
+                      help="Use TITLE as the metadata title.  Default: '<original epub title> Split' or ToC entry with --split-by-section", metavar="TITLE")
+    parser.add_option("-d", "--description", dest="descopt", default=None,
+                      help="Use DESC as the metadata description.  Default: 'Split from <epub title> by <author>'.", metavar="DESC")
+    parser.add_option("-a", "--author",
+                      action="append", dest="authoropts", default=[],
+                      help="Use AUTHOR as a metadata author, multiple authors may be given, Default: <All authors from original epub>", metavar="AUTHOR")
+    parser.add_option("-g", "--tag",
+                      action="append", dest="tagopts", default=[],
+                      help="Include TAG as dc:subject tag, multiple tags may be given, Default: None", metavar="TAG")
+    parser.add_option("-l", "--language",
+                      action="append", dest="languageopts", default=[],
+                      help="Include LANG as dc:language tag, multiple languages may be given, Default: en", metavar="LANG")
+    parser.add_option("-c", "--cover", dest="coveropt", default=None,
+                      help="Path to a jpg to use as cover image.", metavar="COVER")
+    (options, args) = parser.parse_args(argv)
+    ## Add .epub if not already there.
+    if not options.outputopt.lower().endswith(".epub"):
+        options.outputopt=options.outputopt+".epub"
+    if not options.languageopts:
+        options.languageopts = ['en']
+    if not args:
+        parser.print_help()
+        return
+    epubO = SplitEpub(args[0])
+    lines = epubO.get_split_lines()
+    if options.split_by_section:
+        if len(args) > 1:
+            section_lines = args[1:]
+        else:
+            section_lines = range(len(lines))
+        splitslist = []
+        sectionlist = []
+        title=None
+        for lineno in section_lines:
+            toclist = lines[int(lineno)]['toc']
+            if sectionlist and not toclist:
+                sectionlist.append(lineno)
+            else:
+                ## take title from (first) ToC if available, else titleopt (_ Split internally if None)
+                title = (toclist[0] if toclist else options.titleopt)
+                print("title: %s"%title)
+                sectionlist=[lineno]
+                splitslist.append((sectionlist,title))
+        if sectionlist:
+            splitslist.append((sectionlist,title))
+        # print(splitslist)
+        filecount = 1
+        for sectionlist, title in splitslist:
+            outputfile = "%0.4d-%s"%(filecount,options.outputopt)
+            if options.outputdiropt:
+                outputfile = os.path.join(options.outputdiropt,outputfile)
+            print("output file: "+outputfile)
+            epubO.write_split_epub(outputfile,
+                                   sectionlist,
+                                   authoropts=options.authoropts,
+                                   titleopt=title,
+                                   descopt=options.descopt,
+                                   tags=options.tagopts,
+                                   languages=options.languageopts,
+                                   coverjpgpath=options.coveropt)
+            filecount+=1
+        return
+    elif len(args) == 1:
+        count = 0
+        showlist=['toc','guide','anchor','id','href']
+        for line in lines:
+            print("\nLine Number: %d"%count)
+            for s in showlist:
+                if s in line and line[s]:
+                    print("\t%s: %s"%(s,line[s]))
+            count += 1
+        return
+    if len(args) > 1:
+        outputfile = options.outputopt
+        if options.outputdiropt:
+            outputfile = os.path.join(options.outputdiropt,outputfile)
+        print("output file: "+outputfile)
+        epubO.write_split_epub(outputfile,
+                               args[1:],
+                               authoropts=options.authoropts,
+                               titleopt=options.titleopt,
+                               descopt=options.descopt,
+                               tags=options.tagopts,
+                               languages=options.languageopts,
+                               coverjpgpath=options.coveropt)
+        return
+if __name__ == "__main__":
+    main(sys.argv[1:])

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+gradio
+requests
+beautifulsoup4
+ebooklib
+huggingface_hub
+datasets
+hf-transfer
+protobuf
+click
+pydantic
+torch
+uvicorn
+html5lib
+lxml