Spaces:

rosa0003
/

MindMap

Runtime error

App Files Files Community

rosa0003 committed on Oct 29, 2024

Commit

f867c73

verified ·

1 Parent(s): 066bc00

Upload 5 files

Browse files

Files changed (5) hide show

.gitignore +101 -0
LICENSE +21 -0
setup.py +37 -0
toc2mindmap/convert.py +92 -0
toc2mindmap/dumppdf.py +280 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,101 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# pyenv
+.python-version
+# celery beat schedule file
+celerybeat-schedule
+# SageMath parsed files
+*.sage.py
+# dotenv
+.env
+# virtualenv
+.venv
+venv/
+ENV/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2018 Lucas
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

setup.py ADDED Viewed

	@@ -0,0 +1,37 @@

+#!/usr/bin/env python
+'''
+PdfToc2MindMap  setup
+Warnings:
+    to make pip respect the links, you have to use
+    `--process-dependency-links` switch. So e.g.:
+    `pip install --process-dependency-links <repo_path_or_url>`
+'''
+import setuptools
+# see http://setuptools.readthedocs.io/en/latest/setuptools.html
+# and https://packaging.python.org/tutorials/distributing-packages/
+setuptools.setup(
+    name='PdfToc2MindMap',
+    version='0.1.0a1',
+    install_requires=['pdfminer.six'],
+    dependency_links=[
+        'git+https://github.com/andrii-z4i/xmind-sdk-python.git@master-0'
+    ],
+    packages=setuptools.find_packages(exclude=('tests*',)), # find automatically
+    author='Lucas Koelman',
+    author_email='lucas.koelman@gmail.com',
+    description='PdfToc2MindMap: create mindmaps from table of contents in a PDF file',
+    long_description='PdfToc2MindMap: create mindmaps from table of contents in a PDF file',
+    license='MIT',
+    keywords=('pdf', 'mindmap', 'visualization'),
+    url='https://github.com/mananatee/PdfToc2MindMap',
+    classifiers=[
+        'Development Status :: 3 - Alpha',
+        'Environment :: Console',
+        'License :: OSI Approved :: MIT License',
+        'Topic :: Utilities',
+        'Programming Language :: Python :: 3',],
+    entry_points={},
+    package_data={})

toc2mindmap/convert.py ADDED Viewed

	@@ -0,0 +1,92 @@

+#-*- coding: utf-8 -*-
+"""
+Module to convert table of contents of PDF file to mindmap format.
+@author     Lucas Koelman
+@date       20/02/2018
+@see        https://github.com/pdfminer/pdfminer.six/blob/master/tools/dumppdf.py
+@see        https://github.com/xmindltd/xmind-sdk-python/blob/master/example.py
+"""
+# standard library
+import sys, os, re
+import xml.etree.ElementTree as etree
+try:
+    import cStringIO as io # Python 2
+except (ImportError, ModuleNotFoundError):
+    import io # Python 3
+import dumppdf
+import xmind
+from xmind.core.topic import TopicElement
+def toc_to_xmind(outfp, pdf_filename):
+    """
+    Convert table of contents of given PDF file to XMind document.
+    """
+    out_str = io.StringIO()
+    dumppdf.dumpoutline(out_str, pdf_filename, [], set())
+    # Parse XML
+    toc_xml = out_str.getvalue()
+    out_str.close() # no 'with' statement possible
+    root_elem = etree.fromstring(toc_xml)
+    # Convert XML to XMind document
+    xwb = xmind.load(outfp) # load an existing file or create a new workbook if nothing is found
+    # Create XMind workbook
+    s1 = xwb.getPrimarySheet()
+    s1.setTitle(os.path.split(pdf_filename)[-1])
+    root_topic = s1.getRootTopic()
+    root_topic.setTitle("Contents")
+    # Transform each XML node into a mindmap node
+    topic_stack = [root_topic] # length will always equal depth/level during traversal
+    prev_level = 0
+    for node in root_elem.iter(): # depth-first traversal
+        if 'level' not in node.attrib:
+            continue # irrelevant node
+        node_level = int(node.attrib['level'])
+        # Create topic for this node
+        topic = TopicElement(ownerWorkbook=xwb)
+        title = re.sub(r"^[a-zA-Z]'(.*)'$", r'\1', node.attrib['title'])
+        topic.setTitle(title)
+        # Add it to the topic tree
+        level_difference = node_level - prev_level
+        for _ in range(-level_difference+1): # negative yields empty list
+            topic_stack.pop()
+        topic_stack[-1].addSubTopic(topic)
+        topic_stack.append(topic)
+        prev_level = node_level
+    xmind.save(xwb)
+def main(argv):
+    """
+    Run conversion tool from command line.
+    """
+    import getopt
+    def usage():
+        print ('usage: %s -o outfile.xmind pdf_file.pdf' % argv[0])
+        return 100
+    try:
+        (opts, args) = getopt.getopt(argv[1:], 'o:')
+    except getopt.GetoptError:
+        return usage()
+    if not args:
+        return usage()
+    dopts = dict(opts)
+    outfp = dopts['-o']
+    pdf_filename = args[0]
+    toc_to_xmind(outfp, pdf_filename)
+# Script entry point: the value returned by main() becomes the exit status
+if __name__ == '__main__':
+    sys.exit(main(sys.argv))

toc2mindmap/dumppdf.py ADDED Viewed

	@@ -0,0 +1,280 @@

+"""
+This is the dumppdf script copied from
+https://github.com/pdfminer/pdfminer.six/tree/master/tools.
+The script was copied here since it is not part of an exposed package.
+"""
+#
+# dumppdf.py - dump pdf contents in XML format.
+#
+#  usage: dumppdf.py [options] [files ...]
+#  options:
+#    -i objid : object id
+#
+import sys, os.path, re, logging
+from pdfminer.psparser import PSKeyword, PSLiteral, LIT
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
+from pdfminer.pdftypes import PDFObjectNotFound, PDFValueError
+from pdfminer.pdftypes import PDFStream, PDFObjRef, resolve1, stream_value
+from pdfminer.pdfpage import PDFPage
+from pdfminer.utils import isnumber
+ESC_PAT = re.compile(r'[\000-\037&<>()"\042\047\134\177-\377]')
+def e(s):
+    if six.PY3 and isinstance(s,six.binary_type):
+        s=str(s,'latin-1')
+    return ESC_PAT.sub(lambda m:'&#%d;' % ord(m.group(0)), s)
+import six # Python 2+3 compatibility
+# dumpxml
+def dumpxml(out, obj, codec=None):
+    # Write an XML rendering of obj to out, dispatching on the type of obj.
+    #   out   : object with a write() method
+    #   obj   : None, dict, list, string/bytes, PDFStream, PDFObjRef,
+    #           PSKeyword, PSLiteral or a number
+    #   codec : only consulted for PDFStream objects ('raw', 'binary', 'text'
+    #           or anything else); it is not forwarded to nested values
+    # Raises TypeError for any other type of obj.
+    if obj is None:
+        out.write('<null />')
+        return
+    if isinstance(obj, dict):
+        out.write('<dict size="%d">\n' % len(obj))
+        for (k,v) in six.iteritems(obj):
+            # the key is written as-is, without going through e()
+            out.write('<key>%s</key>\n' % k)
+            out.write('<value>')
+            dumpxml(out, v)
+            out.write('</value>\n')
+        out.write('</dict>')
+        return
+    if isinstance(obj, list):
+        out.write('<list size="%d">\n' % len(obj))
+        for v in obj:
+            dumpxml(out, v)
+            out.write('\n')
+        out.write('</list>')
+        return
+    if isinstance(obj, (six.string_types, six.binary_type)):
+        # size is the length before escaping
+        out.write('<string size="%d">%s</string>' % (len(obj), e(obj)))
+        return
+    if isinstance(obj, PDFStream):
+        if codec == 'raw':
+            # stream payload only, as returned by get_rawdata(); no XML wrapper
+            out.write(obj.get_rawdata())
+        elif codec == 'binary':
+            # stream payload only, as returned by get_data(); no XML wrapper
+            out.write(obj.get_data())
+        else:
+            # XML wrapper with the stream attributes; the data is included
+            # (escaped) only for codec == 'text'
+            out.write('<stream>\n<props>\n')
+            dumpxml(out, obj.attrs)
+            out.write('\n</props>\n')
+            if codec == 'text':
+                data = obj.get_data()
+                out.write('<data size="%d">%s</data>\n' % (len(data), e(data)))
+            out.write('</stream>')
+        return
+    if isinstance(obj, PDFObjRef):
+        # references are written by id, not followed
+        out.write('<ref id="%d" />' % obj.objid)
+        return
+    if isinstance(obj, PSKeyword):
+        out.write('<keyword>%s</keyword>' % obj.name)
+        return
+    if isinstance(obj, PSLiteral):
+        out.write('<literal>%s</literal>' % obj.name)
+        return
+    if isnumber(obj):
+        out.write('<number>%s</number>' % obj)
+        return
+    raise TypeError(obj)
+# dumptrailers
+def dumptrailers(out, doc):
+    for xref in doc.xrefs:
+        out.write('<trailer>\n')
+        dumpxml(out, xref.trailer)
+        out.write('\n</trailer>\n\n')
+    return
+# dumpallobjs
+def dumpallobjs(out, doc, codec=None):
+    visited = set()
+    out.write('<pdf>')
+    for xref in doc.xrefs:
+        for objid in xref.get_objids():
+            if objid in visited: continue
+            visited.add(objid)
+            try:
+                obj = doc.getobj(objid)
+                if obj is None: continue
+                out.write('<object id="%d">\n' % objid)
+                dumpxml(out, obj, codec=codec)
+                out.write('\n</object>\n\n')
+            except PDFObjectNotFound as e:
+                print >>sys.stderr, 'not found: %r' % e
+    dumptrailers(out, doc)
+    out.write('</pdf>')
+    return
+# dumpoutline
+def dumpoutline(outfp, fname, objids, pagenos, password='',
+                dumpall=False, codec=None, extractdir=None):
+    fp = open(fname, 'rb')
+    parser = PDFParser(fp)
+    doc = PDFDocument(parser, password)
+    pages = dict( (page.pageid, pageno) for (pageno,page)
+                  in enumerate(PDFPage.create_pages(doc), 1) )
+    def resolve_dest(dest):
+        if isinstance(dest, str):
+            dest = resolve1(doc.get_dest(dest))
+        elif isinstance(dest, PSLiteral):
+            dest = resolve1(doc.get_dest(dest.name))
+        if isinstance(dest, dict):
+            dest = dest['D']
+        if isinstance(dest, PDFObjRef):
+            dest = dest.resolve()
+        return dest
+    try:
+        outlines = doc.get_outlines()
+        outfp.write('<outlines>\n')
+        for (level,title,dest,a,se) in outlines:
+            pageno = None
+            if dest:
+                dest = resolve_dest(dest)
+                pageno = pages[dest[0].objid]
+            elif a:
+                action = a
+                if isinstance(action, dict):
+                    subtype = action.get('S')
+                    if subtype and repr(subtype) == '/\'GoTo\'' and action.get('D'):
+                        dest = resolve_dest(action['D'])
+                        pageno = pages[dest[0].objid]
+            s = e(title).encode('utf-8', 'xmlcharrefreplace')
+            outfp.write('<outline level="%r" title="%s">\n' % (level, s))
+            if dest is not None:
+                outfp.write('<dest>')
+                dumpxml(outfp, dest)
+                outfp.write('</dest>\n')
+            if pageno is not None:
+                outfp.write('<pageno>%r</pageno>\n' % pageno)
+            outfp.write('</outline>\n')
+        outfp.write('</outlines>\n')
+    except PDFNoOutlines:
+        pass
+    parser.close()
+    fp.close()
+    return
+# extractembedded
+LITERAL_FILESPEC = LIT('Filespec')
+LITERAL_EMBEDDEDFILE = LIT('EmbeddedFile')
+def extractembedded(outfp, fname, objids, pagenos, password='',
+                    dumpall=False, codec=None, extractdir=None):
+    def extract1(obj):
+        filename = os.path.basename(obj['UF'] or obj['F'])
+        fileref = obj['EF']['F']
+        fileobj = doc.getobj(fileref.objid)
+        if not isinstance(fileobj, PDFStream):
+            raise PDFValueError(
+                'unable to process PDF: reference for %r is not a PDFStream' %
+                (filename))
+        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
+            raise PDFValueError(
+                'unable to process PDF: reference for %r is not an EmbeddedFile' %
+                (filename))
+        path = os.path.join(extractdir, filename)
+        if os.path.exists(path):
+            raise IOError('file exists: %r' % path)
+        print >>sys.stderr, 'extracting: %r' % path
+        out = file(path, 'wb')
+        out.write(fileobj.get_data())
+        out.close()
+        return
+    fp = open(fname, 'rb')
+    parser = PDFParser(fp)
+    doc = PDFDocument(parser, password)
+    for xref in doc.xrefs:
+        for objid in xref.get_objids():
+            obj = doc.getobj(objid)
+            if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
+                extract1(obj)
+    fp.close()
+    return
+# dumppdf
+def dumppdf(outfp, fname, objids, pagenos, password='',
+            dumpall=False, codec=None, extractdir=None):
+    fp = open(fname, 'rb')
+    parser = PDFParser(fp)
+    doc = PDFDocument(parser, password)
+    if objids:
+        for objid in objids:
+            obj = doc.getobj(objid)
+            dumpxml(outfp, obj, codec=codec)
+    if pagenos:
+        for (pageno,page) in enumerate(PDFPage.create_pages(doc)):
+            if pageno in pagenos:
+                if codec:
+                    for obj in page.contents:
+                        obj = stream_value(obj)
+                        dumpxml(outfp, obj, codec=codec)
+                else:
+                    dumpxml(outfp, page.attrs)
+    if dumpall:
+        dumpallobjs(outfp, doc, codec=codec)
+    if (not objids) and (not pagenos) and (not dumpall):
+        dumptrailers(outfp, doc)
+    fp.close()
+    if codec not in ('raw','binary'):
+        outfp.write('\n')
+    return
+# main
+def main(argv):
+    import getopt
+    def usage():
+        print ('usage: %s [-d] [-a] [-p pageid] [-P password] [-r|-b|-t] [-T] [-E directory] [-i objid] file ...' % argv[0])
+        return 100
+    try:
+        (opts, args) = getopt.getopt(argv[1:], 'dap:P:rbtTE:i:o:')
+    except getopt.GetoptError:
+        return usage()
+    if not args: return usage()
+    objids = []
+    pagenos = set()
+    codec = None
+    password = ''
+    dumpall = False
+    proc = dumppdf
+    outfp = sys.stdout
+    extractdir = None
+    for (k, v) in opts:
+        if k == '-d': logging.getLogger().setLevel(logging.DEBUG)
+        elif k == '-o': outfp = open(v, 'w')
+        elif k == '-i': objids.extend( int(x) for x in v.split(',') )
+        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
+        elif k == '-P': password = v
+        elif k == '-a': dumpall = True
+        elif k == '-r': codec = 'raw'
+        elif k == '-b': codec = 'binary'
+        elif k == '-t': codec = 'text'
+        elif k == '-T': proc = dumpoutline
+        elif k == '-E':
+            extractdir = v
+            proc = extractembedded
+    if six.PY2 and sys.stdin.encoding:
+        password = password.decode(sys.stdin.encoding)
+    for fname in args:
+        proc(outfp, fname, objids, pagenos, password=password,
+             dumpall=dumpall, codec=codec, extractdir=extractdir)
+    outfp.close()
+# Script entry point: the value returned by main() becomes the exit status
+if __name__ == '__main__': sys.exit(main(sys.argv))