diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..2a7eb07b981a460bd4c198fb917a30587a7d01e2 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +spec/files/hardmode.pdf filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..a9213a96c5f0c9ce6fa8304e0528da6d5b5f7125 --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +dist +__pycache__ +*.egg-info +*.aux +*.dvi +*.fdb_latexmk +*.fls +*.log +*.out + +# User files +*.pdf +*.toc +recipe_debug.toml diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000000000000000000000000000000000000..f4f7dc13b324932a8b6d4424cb5daf0b1ede749f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,120 @@ +Change log +========== + +pdf.tocgen 1.3.4 +---------------- + +Released November 25, 2023 + +- Add error messages for `--page` and invalid file +- Fix KeyError when extracting ToC from some PDFs with pdftocio + +pdf.tocgen 1.3.3 +---------------- + +Released April 21, 2023 + +- Fix outdated dependencies +- Add vpos output for pdftocio +- Type stability enhancements + +pdf.tocgen 1.3.2 +---------------- + +Released April 20, 2023 + +- Fix outdated build system + +pdf.tocgen 1.3.1 +---------------- + +Released April 20, 2023 + +- Fix file encoding problems on Windows + +pdf.tocgen 1.3.0 +---------------- + +Released November 10, 2021 + +- Fix deprecation warning from PyMuPDF + +pdf.tocgen 1.2.3 +---------------- + +Released January 7, 2021 + +- Compatibility with PyMuPDF 1.18.6 + +pdf.tocgen 1.2.2 +---------------- + +Released October 11, 2020 + +- Compatibility with Python 3.9 + +pdf.tocgen 1.2.1 +---------------- + +Released August 7, 2020 + +- Fix a typo in the help message of `pdftocgen`. 
+ +pdf.tocgen 1.2.0 +---------------- + +Released August 7, 2020 + +- Swap out argparse in favor of getopt, which is much simpler and more + flexible. +- Now we could use `pdfxmeta doc.pdf` to dump an entire document, without the + empty pattern `""`. + +pdf.tocgen 1.1.3 +---------------- + +Released August 4, 2020 + +- Usefully complain when tocparser can't parse an entry + +pdf.tocgen 1.1.2 +---------------- + +Released August 3, 2020 + +- Add `--print` flag for `pdftocio` to force printing ToC. +- Add spec for cli commands. + +pdf.tocgen 1.1.1 +---------------- + +Released July 31, 2020 + +- Add a `--auto` option for `pdfxmeta` to output a valid heading filter directly. + +pdf.tocgen 1.1.0 +---------------- + +Released July 31, 2020 + +- Add a new option for a heading filter to be "greedy", which makes it extract + all the text in a block when at least one match occurs. This is extremely + useful for math-heavy documents. +- fixes the sorting problem with two column layout. + +pdf.tocgen 1.0.1 +---------------- + +Released July 29, 2020 + +- Update documentations +- Fix some linter warnings +- Fix unicode problem in tests +- Some prep work for the next major release + +pdf.tocgen 1.0.0 +---------------- + +Released July 28, 2020 + +- The first stable version diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..3877ae0a7ff6f94ac222fd704e112723db776114 --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. 
By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. 
+ + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. 
+ + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>. 
diff --git a/LICENSE_AGPL b/LICENSE_AGPL new file mode 100644 index 0000000000000000000000000000000000000000..1468d07c88d6a48dae9360ed0094955b54370224 --- /dev/null +++ b/LICENSE_AGPL @@ -0,0 +1,661 @@ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. 
However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. 
+ + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. 
+ + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. 
+ + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. 
There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..c2b5806d4fbd99f02a757abcfeb728c70dafebc6 --- /dev/null +++ b/Makefile @@ -0,0 +1,26 @@ +# As a workaround to [1], we will use a makefile instead +# [1]: https://github.com/python-poetry/poetry/issues/241 + +.PHONY: install test xmeta-demo lint + +test: # run tests + @poetry run mamba --format=documentation ./spec + @poetry run ./spec/cli_spec.sh + +lint: # run lint + @poetry run pylint ./spec ./pdfxmeta ./pdftocgen ./fitzutils ./pdftocio + +xmeta-demo: # a demo of pdfxmeta + @poetry run pdfxmeta ./spec/files/level2.pdf "Section" + +tocgen-demo: # a demo of tocgen + @poetry run pdftocgen ./spec/files/level2.pdf < ./recipes/default_latex.toml + +install: # set up non-dev dependencies + poetry install --no-dev + +dev: # set up dev dependencies + poetry install + +publish: test # publish package to pypi + poetry publish --build diff --git a/QUICK_START.md b/QUICK_START.md new file mode 100644 index 0000000000000000000000000000000000000000..ad0525b101289d221c0ba72cd2a6dd2612bf816a --- /dev/null +++ b/QUICK_START.md @@ -0,0 +1,62 @@ +# PDF ToC Generation Quick Start + +Optional: Run as App +```bash +streamlit run app.py +``` +This will open a local web page where you can upload a PDF, analyze fonts, and generate bookmarks with one click. + +### Find Header Candidates +If you don't know the font size/name of your chapters, this lists the top 25 largest text elements. 
+```bash +python utils/list_longest_fonts.py +``` +*Output: Font Name, Size, Physical Page, Logical Page Label.* + +### Find Header by Context +If you know a specific string (e.g., the first sentence of a chapter) but can't find the header itself, this finds the element *immediately preceding* that string. +```bash +python utils/find_preceding.py "known text string" +``` + +### Debug Text Artifacts +If your bookmarks have weird characters (e.g., `??`), use this to see the raw byte codes (looking for soft hyphens `\xad`, non-breaking spaces `\xa0`, etc.). +```bash +python utils/inspect_bytes.py "Problematic String" +``` + +--- + +## Recipe Generation (pdfxmeta) +Once you have identified the visual style of your headers (e.g., "Caslon 54pt"), you can inspect specific text or automatically generate recipe entries using `pdfxmeta`. + +### Inspect Font Details +To get the exact font name and size of a specific string (e.g., "Chapter 1"): +```bash +pdfxmeta input.pdf "Chapter 1" +``` +*Output will show `font.name`, `font.size`, etc.* + +### Auto-Generate Recipe Entry +To append a valid TOML filter directly to your recipe file (level 1 header): +```bash +pdfxmeta -a 1 input.pdf "Chapter 1" >> recipe.toml +``` + +--- + +## The Pipeline +Run the full extraction and generation pipeline. + +### Middleware: `modify_toc.py` +We use a custom Python script to: +1. **Sanitize Text**: Removes soft hyphens (`\xad`) and cleans encodings. +2. **Format Labels**: Renames bookmarks to `001_Title_pgX`. +3. **Fix Encoding**: Forces UTF-8 handling to prevent pipe corruption. + +### The Command +**Git Bash** is recommended to avoid PowerShell encoding issues. 
+ +```bash +pdftocgen -r recipe.toml input.pdf | python utils/modify_toc.py | pdftocio -o output.pdf input.pdf +``` diff --git a/README b/README new file mode 100644 index 0000000000000000000000000000000000000000..d33278e0aa2685d91d56f4d45399e21d5d2afede --- /dev/null +++ b/README @@ -0,0 +1,214 @@ +pdf.tocgen +========== + + in.pdf + | + | + +----------------------+--------------------+ + | | | + V V V ++----------+ +-----------+ +----------+ +| | recipe | | ToC | | +| pdfxmeta +--------->| pdftocgen +-------->| pdftocio +---> out.pdf +| | | | | | ++----------+ +-----------+ +----------+ + +pdf.tocgen is a set of command-line tools for automatically +extracting and generating the table of contents (ToC) of a +PDF file. It uses the embedded font attributes and position +of headings to deduce the basic outline of a PDF file. + +It works best for PDF files produces from a TeX document +using pdftex (and its friends pdflatex, pdfxetex, etc.), but +it's designed to work with any *software-generated* PDF +files (i.e. you shouldn't expect it to work with scanned +PDFs). Some examples include troff/groff, Adobe InDesign, +Microsoft Word, and probably more. + +Please see the homepage [1] for a detailed introduction. + +Installation +------------ + +pdf.tocgen is written in Python 3. It is known to work with +Python 3.7 to 3.11 on Linux, Windows, and macOS (On BSDs, +you probably need to build PyMuPDF yourself). Use + + $ pip install -U pdf.tocgen + +to install the latest version systemwide. Alternatively, use +`pipx` or + + $ pip install -U --user pdf.tocgen + +to install it for the current user. I would recommend the +latter approach to avoid messing up the package manager on +your system. + +If you are using an Arch-based Linux distro, the package is +also available on AUR [8]. It can be installed using any AUR +helper, for example yay: + + $ yay -S pdf.tocgen + +Workflow +-------- + +The design of pdf.tocgen is influenced by the Unix philosophy [2]. 
+
+I intentionally separated pdf.tocgen into 3 separate programs.
+They work together, but each of them is useful on their own.
+
+1. pdfxmeta: extract the metadata (font attributes, positions)
+   of headings to build a *recipe* file.
+2. pdftocgen: generate a table of contents from the recipe.
+3. pdftocio: import the table of contents to the PDF document.
+
+You should read the example [3] on the homepage for a proper
+introduction, but the basic workflow is as follows.
+
+First, use pdfxmeta to search for the metadata of headings,
+and generate *heading filters* using the automatic setting
+
+    $ pdfxmeta -p page -a 1 in.pdf "Section" >> recipe.toml
+    $ pdfxmeta -p page -a 2 in.pdf "Subsection" >> recipe.toml
+
+Note that `page` needs to be replaced by the page number of
+the search keyword.
+
+The output `recipe.toml` file would contain several heading
+filters, each of which specifies the attributes a heading
+at a particular level should have.
+
+An example recipe file would look like this:
+
+    [[heading]]
+    level = 1
+    greedy = true
+    font.name = "Times-Bold"
+    font.size = 19.92530059814453
+
+    [[heading]]
+    level = 2
+    greedy = true
+    font.name = "Times-Bold"
+    font.size = 11.9552001953125
+
+Then pass the recipe to `pdftocgen` to generate a table of
+contents,
+
+    $ pdftocgen in.pdf < recipe.toml
+    "Preface" 5
+    "Bottom-up Design" 5
+    "Plan of the Book" 7
+    "Examples" 9
+    "Acknowledgements" 9
+    "Contents" 11
+    "The Extensible Language" 14
+    "1.1 Design by Evolution" 14
+    "1.2 Programming Bottom-Up" 16
+    "1.3 Extensible Software" 18
+    "1.4 Extending Lisp" 19
+    "1.5 Why Lisp (or When)" 21
+    "Functions" 22
+    "2.1 Functions as Data" 22
+    "2.2 Defining Functions" 23
+    "2.3 Functional Arguments" 26
+    "2.4 Functions as Properties" 28
+    "2.5 Scope" 29
+    "2.6 Closures" 30
+    "2.7 Local Functions" 34
+    "2.8 Tail-Recursion" 35
+    "2.9 Compilation" 37
+    "2.10 Functions from Lists" 40
+    "Functional Programming" 41
+    "3.1 Functional Design" 41
+    "3.2 Imperative Outside-In" 46
+    "3.3 Functional Interfaces" 48
+    "3.4 Interactive Programming" 50
+    [--snip--]
+
+which can be directly imported to the PDF file using
+`pdftocio`,
+
+    $ pdftocgen in.pdf < recipe.toml | pdftocio -o out.pdf in.pdf
+
+Or if you want to edit the table of contents before
+importing it,
+
+    $ pdftocgen in.pdf < recipe.toml > toc
+    $ vim toc # edit
+    $ pdftocio in.pdf < toc
+
+Each of the three programs has some extra functionalities.
+Use the -h option to see all the options you could pass in.
+
+Development
+-----------
+
+If you want to modify the source code or contribute anything,
+first install poetry [4], which is a dependency and package
+manager for Python used by pdf.tocgen. Then run
+
+    $ poetry install
+
+in the root directory of this repository to set up
+development dependencies.
+
+If you want to test the development version of pdf.tocgen,
+use the `poetry run` command:
+
+    $ poetry run pdfxmeta in.pdf "pattern"
+
+Alternatively, you could also use the
+
+    $ poetry shell
+
+command to open up a virtual environment and run the
+development version directly:
+
+    (pdf.tocgen) $ pdfxmeta in.pdf "pattern"
+
+Before you send a patch or pull request, make sure the unit
+test passes by running:
+
+    $ make test
+
+GUI front end
+-------------
+
+If you are an Emacs user, you could install Daniel Nicolai's
+toc-mode [9] package as a GUI front end for pdf.tocgen,
+though it offers many more functionalities, such as
+extracting (printed) table of contents from a PDF file. Note
+that it uses pdf.tocgen under the hood, so you still need to
+install pdf.tocgen before using toc-mode as a front end for
+pdf.tocgen.
+
+License
+-------
+
+pdf.tocgen itself is free software. The source code of
+pdf.tocgen is licensed under the GNU GPLv3 license. However,
+the recipes in the `recipes` directory are separately
+licensed under the CC BY-NC-SA 4.0 License [7] to prevent
+any commercial usage, and thus not included in the
+distribution.
+ +pdf.tocgen is based on PyMuPDF [5], licensed under the GNU +GPLv3 license, which is again based on MuPDF [6], licensed +under the GNU AGPLv3 license. A copy of the AGPLv3 license +is included in the repository. + +If you want to make any derivatives based on this project, +please follow the terms of the GNU GPLv3 license. + + +[1]: https://krasjet.com/voice/pdf.tocgen/ +[2]: https://en.wikipedia.org/wiki/Unix_philosophy +[3]: https://krasjet.com/voice/pdf.tocgen/#a-worked-example +[4]: https://python-poetry.org/ +[5]: https://github.com/pymupdf/PyMuPDF +[6]: https://mupdf.com/docs/index.html +[7]: https://creativecommons.org/licenses/by-nc-sa/4.0/ +[8]: https://aur.archlinux.org/packages/pdf.tocgen/ +[9]: https://github.com/dalanicolai/toc-mode diff --git a/README.md b/README.md index 8311edac1387771581380451f604af19646ca8c1..19bb3b63b5704e3db46c547078f6291b4fac69d5 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,30 @@ ---- -title: Pdf.tocgen.split -emoji: 🚀 -colorFrom: red -colorTo: red -sdk: docker -app_port: 8501 -tags: -- streamlit -pinned: false -short_description: Split PDF by headings based on Krasjet pdf.tocgen -license: gpl-2.0 ---- - -# Welcome to Streamlit! - -Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart: - -If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community -forums](https://discuss.streamlit.io). +--- +title: PDF TOC Generator Split +emoji: 📑 +colorFrom: blue +colorTo: indigo +sdk: streamlit +sdk_version: 1.41.1 +app_file: app.py +pinned: false +license: agpl-3.0 +short_description: Generate PDF Table of Contents and Split Chapters +--- + +# PDF Table of Contents Generator (Split Edition) + +Based on [pdf.tocgen](https://github.com/Krasjet/pdf.tocgen). + +## Features +- **Analyze Fonts**: Automatically detect chapter headers by font size and style. +- **Search**: Find headers by text search (Case Sensitive option available). 
+- **Generate TOC**: Create a clickable PDF bookmark outline. +- **Split Chapters**: Export each chapter as a separate PDF in a ZIP file. +- **Front/Back Matter**: Automatically handle un-numbered front matter and user-defined back matter (Index, Glossary). + +## Usage +1. Upload a PDF. +2. Use "Scan & Generate" to find headers. +3. Configure the "Back Matter" start page if needed. +4. Run Pipeline. +5. Download the Bookmarked PDF or the Zipped Chapter Splits. diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..0f9a98ba5df1d5ba05d922691738ff2e9776227c --- /dev/null +++ b/app.py @@ -0,0 +1,381 @@ +import streamlit as st +import pandas as pd +import fitz # PyMuPDF +import os +import subprocess +import tempfile +import sys +import toml +import shutil +import zipfile +import io + +# Ensure we can import from utils if needed +sys.path.append(os.path.dirname(__file__)) +from utils import toc_processor +from pdfxmeta import pdfxmeta + +st.set_page_config(page_title="PDF Bookmark Generator", layout="wide") + +st.title("PDF Table of Contents Generator") + +st.markdown(""" +**Upload a PDF**, analyze its fonts to find headers, and generate a clean Table of Contents. +""") + +uploaded_file = st.file_uploader("Choose a PDF file", type="pdf") + +if uploaded_file is not None: + # We need to save the uploaded file to disk for the CLI tools to read it + # We'll use a permanent temp file for the session so we don't have to re-upload constantly + # But for cleanliness, we might want to put this in a temp dir too? 
+ # For now, keeping the input file logic as is (tempfile), but we'll put OUTPUTS in a pure temp dir + + with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf: + tmp_pdf.write(uploaded_file.getvalue()) + input_pdf_path = tmp_pdf.name + + # --- State Management & Reset --- + # Check if a new file is uploaded + file_id = f"{uploaded_file.name}_{uploaded_file.size}" # Robust proxy for ID + if 'current_file_id' not in st.session_state: + st.session_state['current_file_id'] = None + + if st.session_state['current_file_id'] != file_id: + # NEW FILE DETECTED: Reset Pipeline State + keys_to_reset = ['final_pdf_bytes', 'final_zip_bytes', 'final_zip_name', 'search_matches', 'font_name', 'font_size'] + for k in keys_to_reset: + if k in st.session_state: + del st.session_state[k] + st.session_state['current_file_id'] = file_id + # st.toast(f"New file loaded: {uploaded_file.name}. State cleared.") + + st.success(f"Loaded: {uploaded_file.name}") + + # --- Data Source Selection --- + st.header("1. Source Selection") + source_mode = st.radio("Where should the bookmarks come from?", + ["Scan & Generate (Create New)", "Use Existing Bookmarks (Modify)"], + help="Choose 'Scan & Generate' to build new bookmarks from fonts. Choose 'Use Existing' to tidy up bookmarks already in the file.") + + # --- Analysis Section (Only for Generate) --- + if source_mode == "Scan & Generate (Create New)": + st.header("2. 
Analyze Fonts") + + if 'font_name' not in st.session_state: + st.session_state['font_name'] = '' + if 'font_size' not in st.session_state: + st.session_state['font_size'] = 18.0 + + tab1, tab2 = st.tabs(["Scan for Large Fonts", "Search by Text"]) + + with tab1: + if st.button("Find Header Candidates"): + with st.spinner("Scanning PDF for large fonts..."): + doc = fitz.open(input_pdf_path) + candidates = [] + for page in doc[:50]: + text_page = page.get_text("dict") + for block in text_page["blocks"]: + for line in block.get("lines", []): + for span in line["spans"]: + text = span["text"].strip() + if len(text) > 3: + candidates.append({ + "Text": text[:50], + "Font": span["font"], + "Size": round(span["size"], 2), + "Page": page.number + 1 + }) + doc.close() + if candidates: + df = pd.DataFrame(candidates) + summary = df.groupby(['Font', 'Size']).size().reset_index(name='Count') + summary = summary.sort_values(by=['Size', 'Count'], ascending=[False, False]).head(20) + st.session_state['scan_results'] = summary + else: + st.warning("No text found.") + + if 'scan_results' in st.session_state: + st.write("### Top Large Fonts Found") + st.dataframe(st.session_state['scan_results'], use_container_width=True) + + def update_from_scan(): + val = st.session_state.scan_selector + if val: + f_name = val.split(" (")[0] + f_size = float(val.split("(")[1].replace("pt)", "")) + st.session_state['font_name'] = f_name + st.session_state['font_size'] = f_size + + options = st.session_state['scan_results'].apply(lambda x: f"{x['Font']} ({x['Size']}pt)", axis=1) + st.selectbox("Select extraction font:", options, key='scan_selector', on_change=update_from_scan, index=None, placeholder="Choose a font...") + + with tab2: + search_query = st.text_input("Enter text to find (e.g., 'Chapter 1')", "") + + c1, c2 = st.columns([1, 3]) + with c1: + do_search = st.button("Search Text") + with c2: + is_case_sensitive = st.checkbox("Case Sensitive", value=False) + + if do_search: + with 
st.spinner(f"Searching for '{search_query}'..."): + # Use the robust pdfxmeta library + try: + doc = fitz.open(input_pdf_path) + # pdfxmeta expects a regex pattern, so we escape the query to be safe + import re + safe_pattern = re.escape(search_query) + + # extract_meta returns a list of dicts (spans) + results = pdfxmeta.extract_meta(doc, safe_pattern, ign_case=(not is_case_sensitive)) + doc.close() + + matches = [] + for res in results: + matches.append({ + "Text": res.get("text", "").strip(), + "Font": res.get("font", ""), + "Size": round(res.get("size", 0), 2), + "Page": res.get("page_index", 0) + }) + # Limit for display safety + if len(matches) > 50: break + + if matches: + st.session_state['search_matches'] = pd.DataFrame(matches) + else: + st.warning("No matches found.") + + except Exception as e: + st.error(f"Search failed: {e}") + + if 'search_matches' in st.session_state: + st.write(f"### Found Matches") + st.dataframe(st.session_state['search_matches'], use_container_width=True) + + def update_from_search(): + val = st.session_state.search_selector + if val: + parts = val.split(" (") + f_name = parts[0] + f_size = float(parts[1].split("pt)")[0]) + st.session_state['font_name'] = f_name + st.session_state['font_size'] = f_size + + options = st.session_state['search_matches'].apply(lambda x: f"{x['Font']} ({x['Size']}pt) - Pg {x['Page']}", axis=1) + st.selectbox("Select font from match:", options, key='search_selector', on_change=update_from_search, index=None, placeholder="Choose a match...") + + # --- Configuration (Only for Generate) --- + st.header("3. 
Configure Recipe") + col1, col2 = st.columns(2) + with col1: + font_name_input = st.text_input("Font Name", key='font_name') + with col2: + font_size_input = st.number_input("Font Size", key='font_size') + + greedy = st.checkbox("Greedy Match (Merge multiline specs)", value=True) + + # --- Back Matter Configuration --- + with st.expander("Back Matter Configuration (Optional)", expanded=False): + st.markdown("Identify where the **Back Matter** (Index, Glossary, etc.) starts to split it into a separate `999_Back_matter.pdf`.") + + # Independent Search for Back Matter + bm_query = st.text_input("Find Back Matter start (e.g., 'Index')", key="bm_search_query") + + c_bm1, c_bm2 = st.columns([1, 3]) + with c_bm1: + do_bm_search = st.button("Search Back Matter") + with c_bm2: + bm_case_sensitive = st.checkbox("Case Sensitive", key="bm_sens", value=False) + + if do_bm_search: + with st.spinner("Searching..."): + try: + doc = fitz.open(input_pdf_path) + import re + safe_pattern = re.escape(bm_query) + results = pdfxmeta.extract_meta(doc, safe_pattern, ign_case=(not bm_case_sensitive)) + doc.close() + + bm_matches = [] + for res in results: + bm_matches.append({ + "Text": res.get("text", "").strip(), + "Page": res.get("page_index", 0) # Display raw (already 1-based from pdfxmeta) + }) + if len(bm_matches) > 50: break + + if bm_matches: + st.session_state['bm_matches'] = pd.DataFrame(bm_matches) + else: + st.warning("No matches found.") + except Exception as e: + st.error(f"Search failed: {e}") + + if 'bm_matches' in st.session_state: + st.dataframe(st.session_state['bm_matches'], use_container_width=True) + + def update_bm_page(): + val = st.session_state.bm_selector + if val: + # Value format: "Page X - Text..." 
+ page_num = int(val.split(" -")[0].replace("Page ", "")) + st.session_state['back_matter_page'] = page_num + + bm_options = st.session_state['bm_matches'].apply(lambda x: f"Page {x['Page']} - {x['Text'][:30]}...", axis=1) + st.selectbox("Select Start Page:", bm_options, key='bm_selector', on_change=update_bm_page, index=None, placeholder="Select start page...") + + # Manual Override + # Update session state when this input changes + def update_manual_bm(): + st.session_state['back_matter_page'] = st.session_state.back_matter_page_manual + + st.number_input("Or manually set Start Page:", min_value=0, value=st.session_state.get('back_matter_page', 0), key='back_matter_page_manual', on_change=update_manual_bm) + + else: + # Existing Mode + st.info("Using existing bookmarks. They will be cleaned, numbered, and used for splitting/downloading.") + + # --- Generation --- + st.header("4. Process & Generate") + + if st.button("Run Pipeline"): + # Validate inputs if generating + if source_mode == "Scan & Generate (Create New)" and not st.session_state.get('font_name'): + st.error("Please specify a font name for extraction.") + else: + with st.status("Running pipeline tasks...", expanded=True) as status: + # Use a temporary directory for all intermediate files + with tempfile.TemporaryDirectory() as temp_dir: + status.write(f"Created temp workspace: {temp_dir}") + + # Paths + recipe_path = os.path.join(temp_dir, "recipe.toml") + raw_toc_path = os.path.join(temp_dir, "raw.toc") # pdftocgen output + clean_toc_path = os.path.join(temp_dir, "clean.toc") # modify_toc output + output_pdf_path = os.path.join(temp_dir, "final.pdf") + + raw_toc_content = "" + + if source_mode == "Scan & Generate (Create New)": + # 1. 
Create Recipe + recipe_data = { + "heading": [{ + "level": 1, + "greedy": greedy, + "font": { + "name": st.session_state['font_name'], + "size": st.session_state['font_size'], + "size_tolerance": 0.1 + } + }] + } + with open(recipe_path, "w") as f: + toml.dump(recipe_data, f) + status.write("✅ Recipe created") + + # 2. Run pdftocgen -> raw.toc + status.write("Running pdftocgen (Scanning)...") + cmd1 = f'pdftocgen -r "{recipe_path}" "{input_pdf_path}"' + process = subprocess.run(cmd1, shell=True, capture_output=True, text=True, encoding='utf-8') + if process.returncode != 0: + st.error(f"pdftocgen failed: {process.stderr}") + st.stop() + raw_toc_content = process.stdout + status.write("✅ Headers extracted") + + else: + # Existing Bookmarks + status.write("Extracting existing bookmarks...") + # Run pdftocio in extract mode + cmd1 = f'pdftocio "{input_pdf_path}"' + process = subprocess.run(cmd1, shell=True, capture_output=True, text=True, encoding='utf-8') + if process.returncode != 0: + st.error(f"pdftocio failed: {process.stderr}") + st.stop() + raw_toc_content = process.stdout + if not raw_toc_content.strip(): + st.warning("No existing bookmarks found!") + st.stop() + status.write("✅ Existing bookmarks imported") + + # 3. Clean Content (Using centralized utility) + status.write("Cleaning and merging bookmarks...") + cleaned_toc_content = toc_processor.process_toc(raw_toc_content) + + with open(clean_toc_path, "w", encoding='utf-8') as f: + f.write(cleaned_toc_content) + status.write("✅ Bookmarks formatted (Double-splits fixed)") + + # 4. Write PDF + status.write("Writing to PDF...") + cmd3 = f'pdftocio -t "{clean_toc_path}" -o "{output_pdf_path}" "{input_pdf_path}"' + process = subprocess.run(cmd3, shell=True, capture_output=True, text=True) + if process.returncode != 0: + st.error(f"pdftocio failed: {process.stderr}") + st.stop() + status.write("✅ PDF saved") + + # 5. 
Read Result for Download + with open(output_pdf_path, "rb") as f: + st.session_state['final_pdf_bytes'] = f.read() + + # 6. Split & Zip (The Feature) + # Use a temp file for the zip to avoid memory issues + with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as tmp_zip: + tmp_zip_path = tmp_zip.name + + try: + # Pass back_matter_page if it exists and is valid + bm_page = st.session_state.get('back_matter_page', 0) + if bm_page == 0: bm_page = None + + toc_processor.generate_chapter_splits(output_pdf_path, tmp_zip_path, back_matter_start_page=bm_page) + + with open(tmp_zip_path, "rb") as f: + st.session_state['final_zip_bytes'] = f.read() + + base_name = os.path.splitext(uploaded_file.name)[0] + st.session_state['final_zip_name'] = f"{base_name}_chapters.zip" + + except Exception as e: + st.error(f"Error generating zip: {e}") + finally: + if os.path.exists(tmp_zip_path): + os.unlink(tmp_zip_path) + + # --- Persistent Download Area --- + if 'final_pdf_bytes' in st.session_state: + st.success("Pipeline completed successfully!") + st.write("### Downloads") + + c_dl1, c_dl2 = st.columns(2) + with c_dl1: + st.download_button( + label="Download Bookmarked PDF", + data=st.session_state['final_pdf_bytes'], + file_name="bookmarked_doc.pdf", + mime="application/pdf", + key="dl_pdf_btn" + ) + + with c_dl2: + if 'final_zip_bytes' in st.session_state: + st.download_button( + label=f"Download ZIP ({st.session_state['final_zip_name']})", + data=st.session_state['final_zip_bytes'], + file_name=st.session_state['final_zip_name'], + mime="application/zip", + key="dl_zip_btn" + ) + + st.markdown("---") + st.markdown(""" +
+ Based on pdf.tocgen by krasjet.
+ Enhanced with UI, Chapter Splitting, and Metadata Search. Licensed under AGPL-3.0. +
+ """, unsafe_allow_html=True) diff --git a/fitzutils/__init__.py b/fitzutils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5af8abf9c7e0f10603381e66862c2618a1aa0ffd --- /dev/null +++ b/fitzutils/__init__.py @@ -0,0 +1,17 @@ +"""A collection of utility functions to work with PyMuPDF""" + +from .fitzutils import ( + open_pdf, + ToCEntry, + dump_toc, + pprint_toc, + get_file_encoding +) + +__all__ = [ + 'open_pdf', + 'ToCEntry', + 'dump_toc', + 'pprint_toc', + 'get_file_encoding' +] diff --git a/fitzutils/fitzutils.py b/fitzutils/fitzutils.py new file mode 100644 index 0000000000000000000000000000000000000000..1f1167b3c2e9308e32fc2ad43d66bca3cbfc8ec7 --- /dev/null +++ b/fitzutils/fitzutils.py @@ -0,0 +1,112 @@ +from contextlib import contextmanager +from dataclasses import dataclass +from typing import Optional, ContextManager, List, Tuple +from fitz import Document + +import sys +import fitz +import io +import csv +import chardet + + +@contextmanager +def open_pdf(path: str, + exit_on_error: bool = True + ) -> ContextManager[Optional[Document]]: + """A context manager for fitz Document + + This context manager will take care of the error handling when creating a + fitz Document. 
@dataclass
class ToCEntry:
    """One row of a table of contents."""
    level: int      # heading depth, 1-based
    title: str
    pagenum: int
    # vertical offset of the heading on its page (bbox top); acts as a
    # secondary sort key so entries on the same page keep document order
    vpos: Optional[float] = None

    @staticmethod
    def key(e) -> Tuple[int, float]:
        """Sort key: order by page number first, then vertical position."""
        vertical = e.vpos if e.vpos is not None else 0
        return e.pagenum, vertical

    def to_fitz_entry(self) -> list:
        """Convert to the [level, title, page(, vpos)] list PyMuPDF expects."""
        entry = [self.level, self.title, self.pagenum]
        if self.vpos is not None:
            entry.append(self.vpos)
        return entry
def get_file_encoding(path: str) -> str:
    """Detect the text encoding of a file.

    Runs chardet over the raw bytes. Falls back to utf-8 when the file
    cannot be read or when detection is inconclusive — chardet returns
    None for e.g. empty files, and this function's contract is to always
    return a usable encoding name.

    Argument
        path: file path
    Returns
        encoding string (never None)
    """
    try:
        with open(path, "rb") as f:
            enc = chardet.detect(f.read()).encoding
    except OSError:
        # narrowed from a bare `except:` which also swallowed
        # SystemExit/KeyboardInterrupt; unreadable file -> fall back
        enc = None
    return enc or 'utf-8'
def admits_float(expect: Optional[float],
                 actual: Optional[float],
                 tolerance: float) -> bool:
    """Check if a float should be admitted by a filter.

    An unset expectation (None) admits any value; otherwise the actual
    value must be present and lie within `tolerance` of the expectation.
    """
    if expect is None:
        return True
    if actual is None:
        return False
    return abs(expect - actual) <= tolerance
class BoundingBoxFilter:
    """Filter on the bounding box of a span.

    Every side is optional: an unset side admits any position. Values
    are compared through admits_float using `tolerance` (defaulting to
    DEF_TOLERANCE).
    """
    left: Optional[float]
    top: Optional[float]
    right: Optional[float]
    bottom: Optional[float]
    # fixed: annotation previously read "tolernace", which never matched
    # the "tolerance" attribute assigned in __init__
    tolerance: float

    def __init__(self, bbox_dict: dict):
        self.left = bbox_dict.get('left')
        self.top = bbox_dict.get('top')
        self.right = bbox_dict.get('right')
        self.bottom = bbox_dict.get('bottom')
        self.tolerance = bbox_dict.get('tolerance', DEF_TOLERANCE)

    def admits(self, spn: dict) -> bool:
        """Check if the bounding box admits the span

        Argument
            spn: the span dict to be checked
        Returns
            False if the span doesn't match current bounding box setting
        """
        bbox = spn.get('bbox', (None, None, None, None))
        return (admits_float(self.left, bbox[0], self.tolerance) and
                admits_float(self.top, bbox[1], self.tolerance) and
                admits_float(self.right, bbox[2], self.tolerance) and
                admits_float(self.bottom, bbox[3], self.tolerance))
raise ValueError("filter's 'level' is not set") + if lvl < 1: + raise ValueError("filter's 'level' must be >= 1") + + self.level = lvl + self.greedy = fltr_dict.get('greedy', False) + self.font = FontFilter(fltr_dict.get('font', {})) + self.bbox = BoundingBoxFilter(fltr_dict.get('bbox', {})) + + def admits(self, spn: dict) -> bool: + """Check if the filter admits the span + + Arguments + spn: the span dict to be checked + Returns + False if the span doesn't match the filter + """ + return self.font.admits(spn) and self.bbox.admits(spn) diff --git a/pdftocgen/recipe.py b/pdftocgen/recipe.py new file mode 100644 index 0000000000000000000000000000000000000000..010bb31fa61e5edb29792a33fd4c6931bf3fdbee --- /dev/null +++ b/pdftocgen/recipe.py @@ -0,0 +1,188 @@ +from dataclasses import dataclass +from typing import Optional, List, Dict, Iterator +from .filter import ToCFilter +from fitzutils import ToCEntry +from itertools import chain +from collections import defaultdict +from fitz import Document + + +class FoundGreedy(Exception): + """A hacky solution to do short-circuiting in Python. + + The main reason to do this short-circuiting is to untangle the logic of + greedy filter with normal execution, which makes the typing and code much + cleaner, but it can also save some unecessary comparisons. + + Probably similar to call/cc in scheme or longjump in C + c.f. 
def concatFrag(frags: Iterator[Optional[Fragment]], sep: str = " ") -> Dict[int, str]:
    """Join heading fragments into one title string per level.

    Argument
        frags: iterator of fragments; None entries are skipped
        sep: separator inserted between fragments of the same level
    Returns
        a dictionary (level -> title) that contains the title for each level.
    """
    # gather the text pieces belonging to each heading level, in order
    pieces: Dict[int, list] = {}
    for frag in frags:
        if frag is None:
            continue
        pieces.setdefault(frag.level, []).append(frag.text)
    return {level: sep.join(texts) for level, texts in pieces.items()}
dict) -> List[Optional[Fragment]]: + """Extract matching heading fragments in a line. + + Argument + line: a line dictionary + { + 'bbox': (float, float, float, float), + 'wmode': int, + 'dir': (float, float), + 'spans': [dict] + } + Returns + a list of fragments concatenated from result in a line + """ + return [self._extract_span(spn) for spn in line.get('spans', [])] + + def extract_block(self, block: dict, page: int) -> List[ToCEntry]: + """Extract matching headings in a block. + + Argument + block: a block dictionary + { + 'bbox': (float, float, float, float), + 'lines': [dict], + 'type': int + } + Returns + a list of toc entries, concatenated from the result of lines + """ + if block.get('type') != 0: + # not a text block + return [] + + vpos = block.get('bbox', (0, 0))[1] + + try: + frags = chain.from_iterable([ + self._extract_line(ln) for ln in block.get('lines') + ]) + titles = concatFrag(frags) + + return [ + ToCEntry(level, title, page, vpos) + for level, title in titles.items() + ] + except FoundGreedy as e: + # Smart Greedy: Only merged text that MATCHES the filter + # Find the filter that triggered this level + relevant_filter = next((f for f in self.filters if f.level == e.level), None) + + parts = [] + if relevant_filter: + for ln in block.get('lines', []): + for spn in ln.get('spans', []): + if relevant_filter.admits(spn): + parts.append(spn.get('text', "").strip()) + + merged_text = " ".join(parts) + if merged_text: + return [ToCEntry(e.level, merged_text, page, vpos)] + else: + return [] + + +def extract_toc(doc: Document, recipe: Recipe) -> List[ToCEntry]: + """Extract toc entries from a document + + Arguments + doc: a pdf document + recipe: recipe from user + Returns + a list of toc entries in the document + """ + result = [] + + for page in doc.pages(): + for blk in page.get_textpage().extractDICT().get('blocks', []): + result.extend( + recipe.extract_block(blk, page.number + 1) + ) + + return result diff --git a/pdftocgen/tocgen.py 
def gen_toc(doc: Document, recipe_dict: dict) -> List[ToCEntry]:
    """Generate the table of content for a document from recipe

    Argument
        doc: a pdf document
        recipe_dict: the recipe dictionary used to generate the toc
    Returns
        a list of ToC entries
    """
    recipe = Recipe(recipe_dict)
    return extract_toc(doc, recipe)
+ +This command can operate in two ways: it can either be used +to extract the table of contents of a PDF, or import table +of contents to a PDF using the output of pdftocgen. + +1. To extract the table of contents of a PDF for + modification, only supply a input file: + + $ pdftocio in.pdf + + or if you want to print it in a readable format, use the + -H flag: + + $ pdftocio -H in.pdf + +2. To import a table of contents to a PDF using the toc file + generated by pdftocgen, use input redirection, + + $ pdftocio in.pdf < toc + + pipes, + + $ pdftocgen -r recipe.toml in.pdf | pdftocio in.pdf + + or the -t flag + + $ pdftocio -t toc in.pdf + + to supply the toc file. If you want to specify an output + file name, use the -o option + + $ pdftocio -t toc -o out.pdf in.pdf + +arguments + in.pdf path to the input PDF document + +options + -h, --help show help + -t, --toc=toc path to the table of contents generated by + pdftocgen. if this option is not given, the + default is stdin, but if no input is piped or + redirected to stdin, this program will instead + print the existing ToC of the PDF file + -v, --vpos if this flag is set, the vertical position of + each heading will be dumped to the output + -p, --print when flag is set, print the existing ToC in + the input PDF file. this flag is usually not + necessary, since it is the default behavior + when no input is given + -H, --human-readable print the toc in a readable format + -o, --out=file.pdf path to the output file. 
if this flag is not + specified, the default is {input}_out.pdf + -g, --debug enable debug mode + -V, --version show version number + +[1]: https://krasjet.com/voice/pdf.tocgen/#step-1-build-a-recipe +""".strip() + + +def main(): + # parse arguments + try: + opts, args = getopt.gnu_getopt( + sys.argv[1:], + "hvt:pHo:gV", + ["help", "vpos", "toc=", "print", "human-readable", "out=", "debug", "version"] + ) + except GetoptError as e: + print(e, file=sys.stderr) + print(usage_s, file=sys.stderr) + sys.exit(2) + + toc_file: TextIO = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='ignore') + print_toc: bool = False + readable: bool = False + out: Optional[str] = None + vpos: bool = False + debug: bool = False + + for o, a in opts: + if o in ("-H", "--human-readable"): + readable = True + elif o in ("-p", "--print"): + print_toc = True + elif o in ("-v", "--vpos"): + vpos = True + elif o in ("-t", "--toc"): + try: + toc_file = open(a, "r", encoding=get_file_encoding(a)) + except IOError as e: + print("error: can't open file for reading", file=sys.stderr) + print(e, file=sys.stderr) + sys.exit(1) + elif o in ("-o", "--out"): + out = a + elif o in ("-g", "--debug"): + debug = True + elif o in ("-V", "--version"): + print("pdftocio", pdftocio.__version__, file=sys.stderr) + sys.exit() + elif o in ("-h", "--help"): + print(help_s, file=sys.stderr) + sys.exit() + + if len(args) < 1: + print("error: no input pdf is given", file=sys.stderr) + print(usage_s, file=sys.stderr) + sys.exit(1) + + path_in: str = args[0] + # done parsing arguments + + try: + with open_pdf(path_in) as doc: + if toc_file.isatty() or print_toc: + # no input from user, switch to output mode and extract the toc + # of pdf + toc = read_toc(doc) + if len(toc) == 0: + print("error: no table of contents found", file=sys.stderr) + sys.exit(1) + + stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='ignore') + + if readable: + print(pprint_toc(toc), file=stdout) + else: + 
def read_toc(doc: Document) -> List[ToCEntry]:
    """Read table of contents from a document.

    get_toc(False) yields [level, title, page] entries plus an optional
    fourth dict; when that dict carries a 'to' destination point, its y
    coordinate is preserved as the entry's vertical position.
    """
    entries = []
    for e in doc.get_toc(False):
        if len(e) == 4 and 'to' in e[3]:
            entries.append(ToCEntry(e[0], e[1], e[2], e[3]['to'].y))
        else:
            entries.append(ToCEntry(e[0], e[1], e[2]))
    return entries
ToCEntry: + """parse a row in csv to a toc entry""" + + # a somewhat weird hack, csv reader would read spaces as an empty '', so we + # only need to count the number of '' before an entry to determined the + # heading level + indent = len(list(takewhile(lambda x: x == '', entry))) + try: + toc_entry = ToCEntry( + int(indent / 4) + 1, # 4 spaces = 1 level + entry[indent], # heading + int(entry[indent + 1]), # pagenum + *entry[indent + 2:] # vpos + ) + return toc_entry + except IndexError as e: + print(f"Unable to parse toc entry {entry};", + f"Need at least {indent + 2} parts but only have {len(entry)}.", + "Make sure the page number is present.", + file=sys.stderr) + raise e + + +def parse_toc(file: IO) -> List[ToCEntry]: + """Parse a toc file to a list of toc entries""" + reader = csv.reader(file, lineterminator='\n', + delimiter=' ', quoting=csv.QUOTE_NONNUMERIC) + return list(map(parse_entry, reader)) diff --git a/pdfxmeta/__init__.py b/pdfxmeta/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..207212c694c3bb9c517747e1e6953ce3666ad8b0 --- /dev/null +++ b/pdfxmeta/__init__.py @@ -0,0 +1,5 @@ +"""Extract metadata (fonts, bounding box) for a string in a pdf""" + +__version__ = '1.3.4' + +from .pdfxmeta import extract_meta, dump_meta, dump_toml diff --git a/pdfxmeta/__main__.py b/pdfxmeta/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..528380d62037100b8f2f0746d849a6ff67e2e3be --- /dev/null +++ b/pdfxmeta/__main__.py @@ -0,0 +1,4 @@ +from .app import main + +if __name__ == '__main__': + main() diff --git a/pdfxmeta/app.py b/pdfxmeta/app.py new file mode 100644 index 0000000000000000000000000000000000000000..3d27c9f31430574a39e4c0001fd9b93f558c8174 --- /dev/null +++ b/pdfxmeta/app.py @@ -0,0 +1,147 @@ +"""The executable of pdfxmeta""" + +import getopt +import sys +import pdfxmeta +import io + +from getopt import GetoptError +from typing import Optional, TextIO +from fitzutils import open_pdf +from 
textwrap import indent +from pdfxmeta import dump_meta, dump_toml, extract_meta + + +usage_s = """ +usage: pdfxmeta [options] doc.pdf [pattern] +""".strip() + +help_s = """ +usage: pdfxmeta [options] doc.pdf [pattern] + +Extract the metadata for pattern in doc.pdf. + +To use this command, first open up the pdf file with your +favorite pdf reader and find the text you want to search +for. Then use + + $ pdfxmeta -p 1 in.pdf "Subsection One" + +to find the metadata, mainly the font attributes and +bounding box, of lines containing the pattern "Subsection +One" on page 1. Specifying a page number is optional but +highly recommended, since it greatly reduces the ambiguity +of matches and execution time. + +The output of this command can be directly copy-pasted to +build a recipe file for pdftocgen. Alternatively, you could +also use the --auto or -a flag to output a valid heading +filter directly + + $ pdfxmeta -p 1 -a 2 in.pdf "Subsection One" >> recipe.toml + +where the argument of -a is the level of the heading filter, +which in this case is 2. + +arguments + doc.pdf path to the input PDF document + [pattern] the pattern to search for (python regex). if not + given, dump the entire document + +options + -h, --help show help + -p, --page=PAGE specify the page to search for (1-based index) + -i, --ignore-case when flag is set, search will be case-insensitive + -a, --auto=LEVEL when flag is set, the output would be a valid + heading filter of the specified heading level in + default settings. it is directly usable by + pdftocgen. + -o, --out=FILE path to the output file. 
if this flag is not + specified, the default is stdout + -V, --version show version number +""".strip() + + +def print_result(meta: dict) -> str: + """pretty print results in a structured manner""" + return f"{meta.get('text', '')}:\n{indent(dump_meta(meta), ' ')}" + + +def main(): + # parse arguments + try: + opts, args = getopt.gnu_getopt( + sys.argv[1:], + "hiVp:a:o:", + ["help", "ignore-case", "version", "page=", "auto=", "out="] + ) + except GetoptError as e: + print(e, file=sys.stderr) + print(usage_s, file=sys.stderr) + sys.exit(2) + + ignore_case: bool = False + page: Optional[int] = None + auto_level: Optional[int] = None + out: TextIO = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='ignore') + + for o, a in opts: + if o in ("-i", "--ignore-case"): + ignore_case = True + elif o in ("-p", "--page"): + try: + page = int(a) + except ValueError as e: + print("error: invalid page number", file=sys.stderr) + sys.exit(1) + elif o in ("-a", "--auto"): + try: + auto_level = int(a) + except ValueError as e: + print("error: invalid level", file=sys.stderr) + sys.exit(1) + elif o in ("-o", "--out"): + try: + out = open(a, "w", encoding='utf-8', errors='ignore') + except IOError as e: + print("error: can't open file for writing", file=sys.stderr) + print(e, file=sys.stderr) + sys.exit(1) + elif o in ("-V", "--version"): + print("pdfxmeta", pdfxmeta.__version__, file=sys.stderr) + sys.exit() + elif o in ("-h", "--help"): + print(help_s, file=sys.stderr) + sys.exit() + + argc = len(args) + + if argc < 1: + print("error: no input pdf is given", file=sys.stderr) + print(usage_s, file=sys.stderr) + sys.exit(1) + + path_in: str = args[0] + pattern: str = "" + + if argc >= 2: + pattern = args[1] + + # done parsing arguments + + with open_pdf(path_in) as doc: + meta = extract_meta(doc, pattern, page, ignore_case) + + # nothing found + if len(meta) == 0: + sys.exit(1) + + # should we add \n between each output? 
+ addnl = not out.isatty() + + if auto_level: + print('\n'.join( + [dump_toml(m, auto_level, addnl) for m in meta] + ), file=out) + else: + print('\n'.join(map(print_result, meta)), file=out) diff --git a/pdfxmeta/pdfxmeta.py b/pdfxmeta/pdfxmeta.py new file mode 100644 index 0000000000000000000000000000000000000000..acdb949feefd1a3057074362bc46e3c418eba8c1 --- /dev/null +++ b/pdfxmeta/pdfxmeta.py @@ -0,0 +1,194 @@ +"""Extract metadata for a string in a pdf file""" + +from toml.encoder import _dump_str, _dump_float + +import re + +from fitz import Document, Page +from typing import Optional, List + + +def extract_meta(doc: Document, + pattern: str, + page: Optional[int] = None, + ign_case: bool = False + ) -> List[dict]: + """Extract meta for a `pattern` on `page` in a pdf document + + Arguments + doc: document from pymupdf + pattern: a regular expression pattern + page: page number (1-based index), if None is given, search for the + entire document, but this is highly discouraged. + ign_case: ignore case? + """ + result = [] + + if page is None: + pages = doc.pages() + elif 1 <= page <= doc.page_count: + pages = [doc[page - 1]] + else: # page out of range + return result + + regex = re.compile( + pattern, + re.IGNORECASE + ) if ign_case else re.compile(pattern) + + # we could parallelize this, but I don't see a reason + # to *not* specify a page number + for p in pages: + found = search_in_page(regex, p) + for s in found: + s['page_index'] = p.number + 1 + try: + s['page_label'] = p.get_label() + except Exception: + # Fallback if get_label fails due to PyMuPDF version issues + s['page_label'] = "" + result.extend(found) + + return result + + +def search_in_page(regex: re.Pattern, page: Page) -> List[dict]: + """Search for `text` in `page` and extract meta using optimized search_for""" + result = [] + + # 1. 
Use simple string search if regex is just a literal (optimization) + # But since we have a compiled regex, we might need to extract the pattern if it's simple + # Or just use the regex to find matches in the FULL text of the page first? + # PyMuPDF's search_for takes a string. It doesn't support regex directly in wrapped core. + # However, for the purpose of this tool which claims regex support, we have a dilemma. + # But most users searching "Chapter 1" are doing literal searches. + + # If we want to support the user's "Divided World", we need to handle the case where it might be split. + # The most robust way for PDF text search is usually: + # 1. Get all text (with position). + # 2. Run regex on the full text. + # 3. Map match back to bbox. + # 4. Find spans in bbox. + + # BUT, to keep it simple and fix the immediate "spinning" and "missing" issue: + # The previous code iterated every span. + # Let's try to be smarter. + + # For now, let's assume the user pattern is often a literal or we can approximate it. + # If the user provides a regex, we can't easily use search_for. + # However, the user provided "Divided World". + + # Let's fallback to the robust get_text("dict") but optimize the check? + # No, get_text("dict") IS the slow part. + + # Alternative: + # Use page.get_text("text") -> run regex -> if match, THEN get_text("dict")? + # That saves time for pages that DON'T match. + + # Improved Algorithm: + # 1. Extract plain text of the page. + # 2. If regex doesn't match plain text, SKIP the page. (Huge optimization) + # 3. If it does match, perform the detailed span search. + + text_content = page.get_text() + if not regex.search(text_content): + return [] + + # If we are here, there is a match on this page. Now find the exact spans. + # Note: If the text is split across spans, the simple span iterator below will STILL fail to extract the specific span metadata for the *whole* match. + # But at least it won't spin on empty pages. 
def to_bools(var: int) -> str:
    """Render an int as a lowercase TOML boolean string.

    Any non-zero value (e.g. a masked flag bit) becomes "true".
    """
    return "true" if var else "false"
sep else before + + result.append(f"font.name = {_dump_str(font)}") + result.append(f"font.size = {_dump_float(spn['size'])}") + result.append("# font.size_tolerance = 1e-5") + result.append(f"# font.color = {spn['color']:#08x}") + + flags = spn['flags'] + + result.append(f"# font.superscript = {to_bools(flags & 0b00001)}") + result.append(f"# font.italic = {to_bools(flags & 0b00010)}") + result.append(f"# font.serif = {to_bools(flags & 0b00100)}") + result.append(f"# font.monospace = {to_bools(flags & 0b01000)}") + result.append(f"# font.bold = {to_bools(flags & 0b10000)}") + + bbox = spn['bbox'] + + result.append(f"# bbox.left = {_dump_float(bbox[0])}") + result.append(f"# bbox.top = {_dump_float(bbox[1])}") + result.append(f"# bbox.right = {_dump_float(bbox[2])}") + result.append(f"# bbox.bottom = {_dump_float(bbox[3])}") + result.append("# bbox.tolerance = 1e-5") + + if trail_nl: + result.append("") + + return '\n'.join(result) diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000000000000000000000000000000000000..a86919f10476ffc87b01327ec1399cdd1bf39d9d --- /dev/null +++ b/poetry.lock @@ -0,0 +1,534 @@ +# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. + +[[package]] +name = "args" +version = "0.1.0" +description = "Command Arguments for Humans." +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "args-0.1.0.tar.gz", hash = "sha256:a785b8d837625e9b61c39108532d95b85274acd679693b71ebb5156848fcf814"}, +] + +[[package]] +name = "astroid" +version = "2.11.7" +description = "An abstract syntax tree for Python with inference support." 
+category = "dev" +optional = false +python-versions = ">=3.6.2" +files = [ + {file = "astroid-2.11.7-py3-none-any.whl", hash = "sha256:86b0a340a512c65abf4368b80252754cda17c02cdbbd3f587dddf98112233e7b"}, + {file = "astroid-2.11.7.tar.gz", hash = "sha256:bb24615c77f4837c707669d16907331374ae8a964650a66999da3f5ca68dc946"}, +] + +[package.dependencies] +lazy-object-proxy = ">=1.4.0" +setuptools = ">=20.0" +typed-ast = {version = ">=1.4.0,<2.0", markers = "implementation_name == \"cpython\" and python_version < \"3.8\""} +typing-extensions = {version = ">=3.10", markers = "python_version < \"3.10\""} +wrapt = ">=1.11,<2" + +[[package]] +name = "chardet" +version = "5.1.0" +description = "Universal encoding detector for Python 3" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "chardet-5.1.0-py3-none-any.whl", hash = "sha256:362777fb014af596ad31334fde1e8c327dfdb076e1960d1694662d46a6917ab9"}, + {file = "chardet-5.1.0.tar.gz", hash = "sha256:0d62712b956bc154f85fb0a266e2a3c5913c2967e00348701b32411d6def31e5"}, +] + +[[package]] +name = "clint" +version = "0.5.1" +description = "Python Command Line Interface Tools" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "clint-0.5.1.tar.gz", hash = "sha256:05224c32b1075563d0b16d0015faaf9da43aa214e4a2140e51f08789e7a4c5aa"}, +] + +[package.dependencies] +args = "*" + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." 
+category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "coverage" +version = "7.2.3" +description = "Code coverage measurement for Python" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "coverage-7.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e58c0d41d336569d63d1b113bd573db8363bc4146f39444125b7f8060e4e04f5"}, + {file = "coverage-7.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:344e714bd0fe921fc72d97404ebbdbf9127bac0ca1ff66d7b79efc143cf7c0c4"}, + {file = "coverage-7.2.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:974bc90d6f6c1e59ceb1516ab00cf1cdfbb2e555795d49fa9571d611f449bcb2"}, + {file = "coverage-7.2.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0743b0035d4b0e32bc1df5de70fba3059662ace5b9a2a86a9f894cfe66569013"}, + {file = "coverage-7.2.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d0391fb4cfc171ce40437f67eb050a340fdbd0f9f49d6353a387f1b7f9dd4fa"}, + {file = "coverage-7.2.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4a42e1eff0ca9a7cb7dc9ecda41dfc7cbc17cb1d02117214be0561bd1134772b"}, + {file = "coverage-7.2.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:be19931a8dcbe6ab464f3339966856996b12a00f9fe53f346ab3be872d03e257"}, + {file = "coverage-7.2.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:72fcae5bcac3333a4cf3b8f34eec99cea1187acd55af723bcbd559adfdcb5535"}, + {file = "coverage-7.2.3-cp310-cp310-win32.whl", hash = 
"sha256:aeae2aa38395b18106e552833f2a50c27ea0000122bde421c31d11ed7e6f9c91"}, + {file = "coverage-7.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:83957d349838a636e768251c7e9979e899a569794b44c3728eaebd11d848e58e"}, + {file = "coverage-7.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:dfd393094cd82ceb9b40df4c77976015a314b267d498268a076e940fe7be6b79"}, + {file = "coverage-7.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:182eb9ac3f2b4874a1f41b78b87db20b66da6b9cdc32737fbbf4fea0c35b23fc"}, + {file = "coverage-7.2.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bb1e77a9a311346294621be905ea8a2c30d3ad371fc15bb72e98bfcfae532df"}, + {file = "coverage-7.2.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca0f34363e2634deffd390a0fef1aa99168ae9ed2af01af4a1f5865e362f8623"}, + {file = "coverage-7.2.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55416d7385774285b6e2a5feca0af9652f7f444a4fa3d29d8ab052fafef9d00d"}, + {file = "coverage-7.2.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:06ddd9c0249a0546997fdda5a30fbcb40f23926df0a874a60a8a185bc3a87d93"}, + {file = "coverage-7.2.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:fff5aaa6becf2c6a1699ae6a39e2e6fb0672c2d42eca8eb0cafa91cf2e9bd312"}, + {file = "coverage-7.2.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ea53151d87c52e98133eb8ac78f1206498c015849662ca8dc246255265d9c3c4"}, + {file = "coverage-7.2.3-cp311-cp311-win32.whl", hash = "sha256:8f6c930fd70d91ddee53194e93029e3ef2aabe26725aa3c2753df057e296b925"}, + {file = "coverage-7.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:fa546d66639d69aa967bf08156eb8c9d0cd6f6de84be9e8c9819f52ad499c910"}, + {file = "coverage-7.2.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b2317d5ed777bf5a033e83d4f1389fd4ef045763141d8f10eb09a7035cee774c"}, + {file = 
"coverage-7.2.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be9824c1c874b73b96288c6d3de793bf7f3a597770205068c6163ea1f326e8b9"}, + {file = "coverage-7.2.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2c3b2803e730dc2797a017335827e9da6da0e84c745ce0f552e66400abdfb9a1"}, + {file = "coverage-7.2.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f69770f5ca1994cb32c38965e95f57504d3aea96b6c024624fdd5bb1aa494a1"}, + {file = "coverage-7.2.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1127b16220f7bfb3f1049ed4a62d26d81970a723544e8252db0efde853268e21"}, + {file = "coverage-7.2.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:aa784405f0c640940595fa0f14064d8e84aff0b0f762fa18393e2760a2cf5841"}, + {file = "coverage-7.2.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3146b8e16fa60427e03884301bf8209221f5761ac754ee6b267642a2fd354c48"}, + {file = "coverage-7.2.3-cp37-cp37m-win32.whl", hash = "sha256:1fd78b911aea9cec3b7e1e2622c8018d51c0d2bbcf8faaf53c2497eb114911c1"}, + {file = "coverage-7.2.3-cp37-cp37m-win_amd64.whl", hash = "sha256:0f3736a5d34e091b0a611964c6262fd68ca4363df56185902528f0b75dbb9c1f"}, + {file = "coverage-7.2.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:981b4df72c93e3bc04478153df516d385317628bd9c10be699c93c26ddcca8ab"}, + {file = "coverage-7.2.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c0045f8f23a5fb30b2eb3b8a83664d8dc4fb58faddf8155d7109166adb9f2040"}, + {file = "coverage-7.2.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f760073fcf8f3d6933178d67754f4f2d4e924e321f4bb0dcef0424ca0215eba1"}, + {file = "coverage-7.2.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c86bd45d1659b1ae3d0ba1909326b03598affbc9ed71520e0ff8c31a993ad911"}, + {file = 
"coverage-7.2.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:172db976ae6327ed4728e2507daf8a4de73c7cc89796483e0a9198fd2e47b462"}, + {file = "coverage-7.2.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:d2a3a6146fe9319926e1d477842ca2a63fe99af5ae690b1f5c11e6af074a6b5c"}, + {file = "coverage-7.2.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:f649dd53833b495c3ebd04d6eec58479454a1784987af8afb77540d6c1767abd"}, + {file = "coverage-7.2.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7c4ed4e9f3b123aa403ab424430b426a1992e6f4c8fd3cb56ea520446e04d152"}, + {file = "coverage-7.2.3-cp38-cp38-win32.whl", hash = "sha256:eb0edc3ce9760d2f21637766c3aa04822030e7451981ce569a1b3456b7053f22"}, + {file = "coverage-7.2.3-cp38-cp38-win_amd64.whl", hash = "sha256:63cdeaac4ae85a179a8d6bc09b77b564c096250d759eed343a89d91bce8b6367"}, + {file = "coverage-7.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:20d1a2a76bb4eb00e4d36b9699f9b7aba93271c9c29220ad4c6a9581a0320235"}, + {file = "coverage-7.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ea748802cc0de4de92ef8244dd84ffd793bd2e7be784cd8394d557a3c751e21"}, + {file = "coverage-7.2.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21b154aba06df42e4b96fc915512ab39595105f6c483991287021ed95776d934"}, + {file = "coverage-7.2.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd214917cabdd6f673a29d708574e9fbdb892cb77eb426d0eae3490d95ca7859"}, + {file = "coverage-7.2.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c2e58e45fe53fab81f85474e5d4d226eeab0f27b45aa062856c89389da2f0d9"}, + {file = "coverage-7.2.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:87ecc7c9a1a9f912e306997ffee020297ccb5ea388421fe62a2a02747e4d5539"}, + {file = "coverage-7.2.3-cp39-cp39-musllinux_1_1_i686.whl", hash = 
"sha256:387065e420aed3c71b61af7e82c7b6bc1c592f7e3c7a66e9f78dd178699da4fe"}, + {file = "coverage-7.2.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ea3f5bc91d7d457da7d48c7a732beaf79d0c8131df3ab278e6bba6297e23c6c4"}, + {file = "coverage-7.2.3-cp39-cp39-win32.whl", hash = "sha256:ae7863a1d8db6a014b6f2ff9c1582ab1aad55a6d25bac19710a8df68921b6e30"}, + {file = "coverage-7.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:3f04becd4fcda03c0160d0da9c8f0c246bc78f2f7af0feea1ec0930e7c93fa4a"}, + {file = "coverage-7.2.3-pp37.pp38.pp39-none-any.whl", hash = "sha256:965ee3e782c7892befc25575fa171b521d33798132692df428a09efacaffe8d0"}, + {file = "coverage-7.2.3.tar.gz", hash = "sha256:d298c2815fa4891edd9abe5ad6e6cb4207104c7dd9fd13aea3fdebf6f9b91259"}, +] + +[package.extras] +toml = ["tomli"] + +[[package]] +name = "dill" +version = "0.3.6" +description = "serialize all of python" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "dill-0.3.6-py3-none-any.whl", hash = "sha256:a07ffd2351b8c678dfc4a856a3005f8067aea51d6ba6c700796a4d9e280f39f0"}, + {file = "dill-0.3.6.tar.gz", hash = "sha256:e5db55f3687856d8fbdab002ed78544e1c4559a130302693d839dfe8f93f2373"}, +] + +[package.extras] +graph = ["objgraph (>=1.7.2)"] + +[[package]] +name = "isort" +version = "5.11.5" +description = "A Python utility / library to sort Python imports." 
+category = "dev" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "isort-5.11.5-py3-none-any.whl", hash = "sha256:ba1d72fb2595a01c7895a5128f9585a5cc4b6d395f1c8d514989b9a7eb2a8746"}, + {file = "isort-5.11.5.tar.gz", hash = "sha256:6be1f76a507cb2ecf16c7cf14a37e41609ca082330be4e3436a18ef74add55db"}, +] + +[package.extras] +colors = ["colorama (>=0.4.3,<0.5.0)"] +pipfile-deprecated-finder = ["pip-shims (>=0.5.2)", "pipreqs", "requirementslib"] +plugins = ["setuptools"] +requirements-deprecated-finder = ["pip-api", "pipreqs"] + +[[package]] +name = "jedi" +version = "0.17.2" +description = "An autocompletion tool for Python that can be used for text editors." +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "jedi-0.17.2-py2.py3-none-any.whl", hash = "sha256:98cc583fa0f2f8304968199b01b6b4b94f469a1f4a74c1560506ca2a211378b5"}, + {file = "jedi-0.17.2.tar.gz", hash = "sha256:86ed7d9b750603e4ba582ea8edc678657fb4007894a12bcf6f4bb97892f31d20"}, +] + +[package.dependencies] +parso = ">=0.7.0,<0.8.0" + +[package.extras] +qa = ["flake8 (==3.7.9)"] +testing = ["Django (<3.1)", "colorama", "docopt", "pytest (>=3.9.0,<5.0.0)"] + +[[package]] +name = "lazy-object-proxy" +version = "1.9.0" +description = "A fast and thorough lazy object proxy." 
+category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "lazy-object-proxy-1.9.0.tar.gz", hash = "sha256:659fb5809fa4629b8a1ac5106f669cfc7bef26fbb389dda53b3e010d1ac4ebae"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b40387277b0ed2d0602b8293b94d7257e17d1479e257b4de114ea11a8cb7f2d7"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8c6cfb338b133fbdbc5cfaa10fe3c6aeea827db80c978dbd13bc9dd8526b7d4"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:721532711daa7db0d8b779b0bb0318fa87af1c10d7fe5e52ef30f8eff254d0cd"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:66a3de4a3ec06cd8af3f61b8e1ec67614fbb7c995d02fa224813cb7afefee701"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1aa3de4088c89a1b69f8ec0dcc169aa725b0ff017899ac568fe44ddc1396df46"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-win32.whl", hash = "sha256:f0705c376533ed2a9e5e97aacdbfe04cecd71e0aa84c7c0595d02ef93b6e4455"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:ea806fd4c37bf7e7ad82537b0757999264d5f70c45468447bb2b91afdbe73a6e"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:946d27deaff6cf8452ed0dba83ba38839a87f4f7a9732e8f9fd4107b21e6ff07"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79a31b086e7e68b24b99b23d57723ef7e2c6d81ed21007b6281ebcd1688acb0a"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f699ac1c768270c9e384e4cbd268d6e67aebcfae6cd623b4d7c3bfde5a35db59"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = 
"sha256:bfb38f9ffb53b942f2b5954e0f610f1e721ccebe9cce9025a38c8ccf4a5183a4"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:189bbd5d41ae7a498397287c408617fe5c48633e7755287b21d741f7db2706a9"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-win32.whl", hash = "sha256:81fc4d08b062b535d95c9ea70dbe8a335c45c04029878e62d744bdced5141586"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:f2457189d8257dd41ae9b434ba33298aec198e30adf2dcdaaa3a28b9994f6adb"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:d9e25ef10a39e8afe59a5c348a4dbf29b4868ab76269f81ce1674494e2565a6e"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cbf9b082426036e19c6924a9ce90c740a9861e2bdc27a4834fd0a910742ac1e8"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f5fa4a61ce2438267163891961cfd5e32ec97a2c444e5b842d574251ade27d2"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:8fa02eaab317b1e9e03f69aab1f91e120e7899b392c4fc19807a8278a07a97e8"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e7c21c95cae3c05c14aafffe2865bbd5e377cfc1348c4f7751d9dc9a48ca4bda"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-win32.whl", hash = "sha256:f12ad7126ae0c98d601a7ee504c1122bcef553d1d5e0c3bfa77b16b3968d2734"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-win_amd64.whl", hash = "sha256:edd20c5a55acb67c7ed471fa2b5fb66cb17f61430b7a6b9c3b4a1e40293b1671"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2d0daa332786cf3bb49e10dc6a17a52f6a8f9601b4cf5c295a4f85854d61de63"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:9cd077f3d04a58e83d04b20e334f678c2b0ff9879b9375ed107d5d07ff160171"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:660c94ea760b3ce47d1855a30984c78327500493d396eac4dfd8bd82041b22be"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:212774e4dfa851e74d393a2370871e174d7ff0ebc980907723bb67d25c8a7c30"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f0117049dd1d5635bbff65444496c90e0baa48ea405125c088e93d9cf4525b11"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-win32.whl", hash = "sha256:0a891e4e41b54fd5b8313b96399f8b0e173bbbfc03c7631f01efbe29bb0bcf82"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:9990d8e71b9f6488e91ad25f322898c136b008d87bf852ff65391b004da5e17b"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9e7551208b2aded9c1447453ee366f1c4070602b3d932ace044715d89666899b"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f83ac4d83ef0ab017683d715ed356e30dd48a93746309c8f3517e1287523ef4"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7322c3d6f1766d4ef1e51a465f47955f1e8123caee67dd641e67d539a534d006"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:18b78ec83edbbeb69efdc0e9c1cb41a3b1b1ed11ddd8ded602464c3fc6020494"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:09763491ce220c0299688940f8dc2c5d05fd1f45af1e42e636b2e8b2303e4382"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-win32.whl", hash = "sha256:9090d8e53235aa280fc9239a86ae3ea8ac58eff66a705fa6aa2ec4968b95c821"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-win_amd64.whl", hash = 
"sha256:db1c1722726f47e10e0b5fdbf15ac3b8adb58c091d12b3ab713965795036985f"}, +] + +[[package]] +name = "mamba" +version = "0.11.2" +description = "The definitive testing tool for Python. Born under the banner of Behavior Driven Development." +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "mamba-0.11.2.tar.gz", hash = "sha256:75cfc6dfd287dcccaf86dd753cf48e0a7337487c7c3fafda05a6a67ded6da496"}, +] + +[package.dependencies] +clint = "*" +coverage = "*" + +[[package]] +name = "mccabe" +version = "0.7.0" +description = "McCabe checker, plugin for flake8" +category = "dev" +optional = false +python-versions = ">=3.6" +files = [ + {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, + {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, +] + +[[package]] +name = "parso" +version = "0.7.1" +description = "A Python Parser" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "parso-0.7.1-py2.py3-none-any.whl", hash = "sha256:97218d9159b2520ff45eb78028ba8b50d2bc61dcc062a9682666f2dc4bd331ea"}, + {file = "parso-0.7.1.tar.gz", hash = "sha256:caba44724b994a8a5e086460bb212abc5a8bc46951bf4a9a1210745953622eb9"}, +] + +[package.extras] +testing = ["docopt", "pytest (>=3.0.7)"] + +[[package]] +name = "platformdirs" +version = "3.2.0" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
+category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "platformdirs-3.2.0-py3-none-any.whl", hash = "sha256:ebe11c0d7a805086e99506aa331612429a72ca7cd52a1f0d277dc4adc20cb10e"}, + {file = "platformdirs-3.2.0.tar.gz", hash = "sha256:d5b638ca397f25f979350ff789db335903d7ea010ab28903f57b27e1b16c2b08"}, +] + +[package.dependencies] +typing-extensions = {version = ">=4.5", markers = "python_version < \"3.8\""} + +[package.extras] +docs = ["furo (>=2022.12.7)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.22,!=1.23.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.2.2)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"] + +[[package]] +name = "pylint" +version = "2.13.9" +description = "python code static checker" +category = "dev" +optional = false +python-versions = ">=3.6.2" +files = [ + {file = "pylint-2.13.9-py3-none-any.whl", hash = "sha256:705c620d388035bdd9ff8b44c5bcdd235bfb49d276d488dd2c8ff1736aa42526"}, + {file = "pylint-2.13.9.tar.gz", hash = "sha256:095567c96e19e6f57b5b907e67d265ff535e588fe26b12b5ebe1fc5645b2c731"}, +] + +[package.dependencies] +astroid = ">=2.11.5,<=2.12.0-dev0" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +dill = ">=0.2" +isort = ">=4.2.5,<6" +mccabe = ">=0.6,<0.8" +platformdirs = ">=2.2.0" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = {version = ">=3.10.0", markers = "python_version < \"3.10\""} + +[package.extras] +testutil = ["gitpython (>3)"] + +[[package]] +name = "pymupdf" +version = "1.22.1" +description = "Python bindings for the PDF toolkit and renderer MuPDF" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "PyMuPDF-1.22.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6bda7a64a1263f1c2b6421ae8803db50d4c8a67de95e05d7a38c313de913b0de"}, + {file = "PyMuPDF-1.22.1-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:b5f62ad244b04b7aa5e7d50b06b8bbc582b2f1d0f2c66013051463d63dfe6c5e"}, + {file = "PyMuPDF-1.22.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce633b9d522528959988647dfbd2c9144ad5422dd75e89e60039da36a412fd3c"}, + {file = "PyMuPDF-1.22.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:733e7b87765ea55202b042b7c84c6b94185ee29fe3a2bd2ee02681c0fd584033"}, + {file = "PyMuPDF-1.22.1-cp310-cp310-win32.whl", hash = "sha256:701499f0a17ccc8dd80707dbeb3a2e60657a6bdc05be7c8c69fa60eb134e1805"}, + {file = "PyMuPDF-1.22.1-cp310-cp310-win_amd64.whl", hash = "sha256:81fa90d157ef7b2ecd72eedafe9db56d3b0f8c3b392d7a2057f659bfcc1f7cad"}, + {file = "PyMuPDF-1.22.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4edac1dd8e5c35b55420925b5486bec4427b07a073cd03f6081b7234ed37217e"}, + {file = "PyMuPDF-1.22.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7744b9853fc55df75f6d37a376432eddd450c1d2072f6ef66b392b7229bccdc6"}, + {file = "PyMuPDF-1.22.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:711adc70d664cdd5d361154bb3485546eaa5e8a90827db6abf9c42ca292aa9e1"}, + {file = "PyMuPDF-1.22.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a1d77a3057ad7fc3e2e02e5fedd53199206a49c4b4c5e3ee75458c17d6b739cb"}, + {file = "PyMuPDF-1.22.1-cp311-cp311-win32.whl", hash = "sha256:b5eca48ea55eafcea68b14669a9f5030c15056431b10710d863de9f9a6b1a0ce"}, + {file = "PyMuPDF-1.22.1-cp311-cp311-win_amd64.whl", hash = "sha256:8e0bfbd6195f45326f9182fff04ac2af9568d78fc1f32dcfa15f84a302d8aafe"}, + {file = "PyMuPDF-1.22.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:440efca115e70c8cdfc492e98b182e24c565d8e68f26754e28e61cf108a915d9"}, + {file = "PyMuPDF-1.22.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a70ab2d38b366c7237adce7d54f3028a7825f165a73c137a1746a6b592d26bb2"}, + {file = "PyMuPDF-1.22.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:7e4a924ffecb8046fbfe7dff9b69f9938389f094dccab07a378850bf9f889c62"}, + {file = "PyMuPDF-1.22.1-cp37-cp37m-win32.whl", hash = "sha256:24e66c2ff4d6cfee5b082c3e2c92b40214799888bf2efcca1f70108c3dfedddb"}, + {file = "PyMuPDF-1.22.1-cp37-cp37m-win_amd64.whl", hash = "sha256:51504bfa2ee207c5c1a38d47b4b91af1bacbd8937b959d947d81fc8f7e023bd8"}, + {file = "PyMuPDF-1.22.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:219337a3be00df2bf65071d5e4e1e6759afd06310d4ec7b1c9694a5b03b5d8d6"}, + {file = "PyMuPDF-1.22.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:050719cb42a8847d564af1d8509d7290176e7c4fde6da7be5751303fa8237aed"}, + {file = "PyMuPDF-1.22.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5871b9e38e68b92533fb7c6fbe3eb7b059f5071d4c2e3ff51cedcc73c994afbc"}, + {file = "PyMuPDF-1.22.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5a0332d6dac4ebf32cb7f0c8639b22b56c9475cb87bc0a0361f9cdc9c2d08a1"}, + {file = "PyMuPDF-1.22.1-cp38-cp38-win32.whl", hash = "sha256:127985812c4a2f0106375c4f4916ca68c1559d6b224a050ce75393e454333995"}, + {file = "PyMuPDF-1.22.1-cp38-cp38-win_amd64.whl", hash = "sha256:99764c46fb8df253a3ea9fbb13b132f205561d6227b0d00e673998b18d7280eb"}, + {file = "PyMuPDF-1.22.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fdb21332d28567e278008dd6130564ac0f5de8aff364a1e7809a70a0f969df26"}, + {file = "PyMuPDF-1.22.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:88202e42d957a41deff212dcb1d8e16e469d21d09a72ab372ee2f173a22112c8"}, + {file = "PyMuPDF-1.22.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:36b7fd85f5813045f10b65caf4cbdad03b51b07076f07b205853a1e44c898e34"}, + {file = "PyMuPDF-1.22.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45e601f7b1ee2a0c1a261bb0179eba4a9899117404eccf0a573e6497ed507ea8"}, + {file = "PyMuPDF-1.22.1-cp39-cp39-win32.whl", hash = "sha256:c610acdbd2f2d994130341559f26c098df546a1fc187adee3b63a0f489310808"}, + {file 
= "PyMuPDF-1.22.1-cp39-cp39-win_amd64.whl", hash = "sha256:af1e6d5dd122c097f23a7e89f8c2197310e85a4c8e8f63ff94444188d9bc0a4e"}, + {file = "PyMuPDF-1.22.1.tar.gz", hash = "sha256:ad34bba78ce147cee50e1dc30fa16f29135a4c3d6a2b1c1b0403ebbcc9fbe4be"}, +] + +[[package]] +name = "setuptools" +version = "67.7.0" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "setuptools-67.7.0-py3-none-any.whl", hash = "sha256:888be97fde8cc3afd60f7784e678fa29ee13c4e5362daa7104a93bba33646c50"}, + {file = "setuptools-67.7.0.tar.gz", hash = "sha256:b7e53a01c6c654d26d2999ee033d8c6125e5fa55f03b7b193f937ae7ac999f22"}, +] + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] + +[[package]] +name = "toml" +version = "0.10.2" +description = "Python Library for Tom's Obvious, Minimal Language" +category = "main" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "toml-0.10.2-py2.py3-none-any.whl", hash = 
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, + {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, +] + +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] + +[[package]] +name = "typed-ast" +version = "1.5.4" +description = "a fork of Python 2 and 3 ast modules with type comment support" +category = "dev" +optional = false +python-versions = ">=3.6" +files = [ + {file = "typed_ast-1.5.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:669dd0c4167f6f2cd9f57041e03c3c2ebf9063d0757dc89f79ba1daa2bfca9d4"}, + {file = "typed_ast-1.5.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:211260621ab1cd7324e0798d6be953d00b74e0428382991adfddb352252f1d62"}, + {file = "typed_ast-1.5.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:267e3f78697a6c00c689c03db4876dd1efdfea2f251a5ad6555e82a26847b4ac"}, + {file = "typed_ast-1.5.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c542eeda69212fa10a7ada75e668876fdec5f856cd3d06829e6aa64ad17c8dfe"}, + {file = "typed_ast-1.5.4-cp310-cp310-win_amd64.whl", hash = "sha256:a9916d2bb8865f973824fb47436fa45e1ebf2efd920f2b9f99342cb7fab93f72"}, + {file = "typed_ast-1.5.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:79b1e0869db7c830ba6a981d58711c88b6677506e648496b1f64ac7d15633aec"}, + {file = "typed_ast-1.5.4-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a94d55d142c9265f4ea46fab70977a1944ecae359ae867397757d836ea5a3f47"}, + {file = 
"typed_ast-1.5.4-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:183afdf0ec5b1b211724dfef3d2cad2d767cbefac291f24d69b00546c1837fb6"}, + {file = "typed_ast-1.5.4-cp36-cp36m-win_amd64.whl", hash = "sha256:639c5f0b21776605dd6c9dbe592d5228f021404dafd377e2b7ac046b0349b1a1"}, + {file = "typed_ast-1.5.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:cf4afcfac006ece570e32d6fa90ab74a17245b83dfd6655a6f68568098345ff6"}, + {file = "typed_ast-1.5.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed855bbe3eb3715fca349c80174cfcfd699c2f9de574d40527b8429acae23a66"}, + {file = "typed_ast-1.5.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:6778e1b2f81dfc7bc58e4b259363b83d2e509a65198e85d5700dfae4c6c8ff1c"}, + {file = "typed_ast-1.5.4-cp37-cp37m-win_amd64.whl", hash = "sha256:0261195c2062caf107831e92a76764c81227dae162c4f75192c0d489faf751a2"}, + {file = "typed_ast-1.5.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2efae9db7a8c05ad5547d522e7dbe62c83d838d3906a3716d1478b6c1d61388d"}, + {file = "typed_ast-1.5.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7d5d014b7daa8b0bf2eaef684295acae12b036d79f54178b92a2b6a56f92278f"}, + {file = "typed_ast-1.5.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:370788a63915e82fd6f212865a596a0fefcbb7d408bbbb13dea723d971ed8bdc"}, + {file = "typed_ast-1.5.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4e964b4ff86550a7a7d56345c7864b18f403f5bd7380edf44a3c1fb4ee7ac6c6"}, + {file = "typed_ast-1.5.4-cp38-cp38-win_amd64.whl", hash = "sha256:683407d92dc953c8a7347119596f0b0e6c55eb98ebebd9b23437501b28dcbb8e"}, + {file = "typed_ast-1.5.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4879da6c9b73443f97e731b617184a596ac1235fe91f98d279a7af36c796da35"}, + {file = 
"typed_ast-1.5.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3e123d878ba170397916557d31c8f589951e353cc95fb7f24f6bb69adc1a8a97"}, + {file = "typed_ast-1.5.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ebd9d7f80ccf7a82ac5f88c521115cc55d84e35bf8b446fcd7836eb6b98929a3"}, + {file = "typed_ast-1.5.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98f80dee3c03455e92796b58b98ff6ca0b2a6f652120c263efdba4d6c5e58f72"}, + {file = "typed_ast-1.5.4-cp39-cp39-win_amd64.whl", hash = "sha256:0fdbcf2fef0ca421a3f5912555804296f0b0960f0418c440f5d6d3abb549f3e1"}, + {file = "typed_ast-1.5.4.tar.gz", hash = "sha256:39e21ceb7388e4bb37f4c679d72707ed46c2fbf2a5609b8b8ebc4b067d977df2"}, +] + +[[package]] +name = "typing-extensions" +version = "4.5.0" +description = "Backported and Experimental Type Hints for Python 3.7+" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "typing_extensions-4.5.0-py3-none-any.whl", hash = "sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"}, + {file = "typing_extensions-4.5.0.tar.gz", hash = "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb"}, +] + +[[package]] +name = "wrapt" +version = "1.15.0" +description = "Module for decorators, wrappers and monkey patching." 
+category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" +files = [ + {file = "wrapt-1.15.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ca1cccf838cd28d5a0883b342474c630ac48cac5df0ee6eacc9c7290f76b11c1"}, + {file = "wrapt-1.15.0-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:e826aadda3cae59295b95343db8f3d965fb31059da7de01ee8d1c40a60398b29"}, + {file = "wrapt-1.15.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:5fc8e02f5984a55d2c653f5fea93531e9836abbd84342c1d1e17abc4a15084c2"}, + {file = "wrapt-1.15.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:96e25c8603a155559231c19c0349245eeb4ac0096fe3c1d0be5c47e075bd4f46"}, + {file = "wrapt-1.15.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:40737a081d7497efea35ab9304b829b857f21558acfc7b3272f908d33b0d9d4c"}, + {file = "wrapt-1.15.0-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:f87ec75864c37c4c6cb908d282e1969e79763e0d9becdfe9fe5473b7bb1e5f09"}, + {file = "wrapt-1.15.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:1286eb30261894e4c70d124d44b7fd07825340869945c79d05bda53a40caa079"}, + {file = "wrapt-1.15.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:493d389a2b63c88ad56cdc35d0fa5752daac56ca755805b1b0c530f785767d5e"}, + {file = "wrapt-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:58d7a75d731e8c63614222bcb21dd992b4ab01a399f1f09dd82af17bbfc2368a"}, + {file = "wrapt-1.15.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:21f6d9a0d5b3a207cdf7acf8e58d7d13d463e639f0c7e01d82cdb671e6cb7923"}, + {file = "wrapt-1.15.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ce42618f67741d4697684e501ef02f29e758a123aa2d669e2d964ff734ee00ee"}, + {file = "wrapt-1.15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41d07d029dd4157ae27beab04d22b8e261eddfc6ecd64ff7000b10dc8b3a5727"}, + {file = "wrapt-1.15.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:54accd4b8bc202966bafafd16e69da9d5640ff92389d33d28555c5fd4f25ccb7"}, + {file = "wrapt-1.15.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fbfbca668dd15b744418265a9607baa970c347eefd0db6a518aaf0cfbd153c0"}, + {file = "wrapt-1.15.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:76e9c727a874b4856d11a32fb0b389afc61ce8aaf281ada613713ddeadd1cfec"}, + {file = "wrapt-1.15.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e20076a211cd6f9b44a6be58f7eeafa7ab5720eb796975d0c03f05b47d89eb90"}, + {file = "wrapt-1.15.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a74d56552ddbde46c246b5b89199cb3fd182f9c346c784e1a93e4dc3f5ec9975"}, + {file = "wrapt-1.15.0-cp310-cp310-win32.whl", hash = "sha256:26458da5653aa5b3d8dc8b24192f574a58984c749401f98fff994d41d3f08da1"}, + {file = "wrapt-1.15.0-cp310-cp310-win_amd64.whl", hash = "sha256:75760a47c06b5974aa5e01949bf7e66d2af4d08cb8c1d6516af5e39595397f5e"}, + {file = "wrapt-1.15.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ba1711cda2d30634a7e452fc79eabcadaffedf241ff206db2ee93dd2c89a60e7"}, + {file = "wrapt-1.15.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:56374914b132c702aa9aa9959c550004b8847148f95e1b824772d453ac204a72"}, + {file = "wrapt-1.15.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a89ce3fd220ff144bd9d54da333ec0de0399b52c9ac3d2ce34b569cf1a5748fb"}, + {file = "wrapt-1.15.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3bbe623731d03b186b3d6b0d6f51865bf598587c38d6f7b0be2e27414f7f214e"}, + {file = "wrapt-1.15.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3abbe948c3cbde2689370a262a8d04e32ec2dd4f27103669a45c6929bcdbfe7c"}, + {file = "wrapt-1.15.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b67b819628e3b748fd3c2192c15fb951f549d0f47c0449af0764d7647302fda3"}, + 
{file = "wrapt-1.15.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:7eebcdbe3677e58dd4c0e03b4f2cfa346ed4049687d839adad68cc38bb559c92"}, + {file = "wrapt-1.15.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:74934ebd71950e3db69960a7da29204f89624dde411afbfb3b4858c1409b1e98"}, + {file = "wrapt-1.15.0-cp311-cp311-win32.whl", hash = "sha256:bd84395aab8e4d36263cd1b9308cd504f6cf713b7d6d3ce25ea55670baec5416"}, + {file = "wrapt-1.15.0-cp311-cp311-win_amd64.whl", hash = "sha256:a487f72a25904e2b4bbc0817ce7a8de94363bd7e79890510174da9d901c38705"}, + {file = "wrapt-1.15.0-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:4ff0d20f2e670800d3ed2b220d40984162089a6e2c9646fdb09b85e6f9a8fc29"}, + {file = "wrapt-1.15.0-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:9ed6aa0726b9b60911f4aed8ec5b8dd7bf3491476015819f56473ffaef8959bd"}, + {file = "wrapt-1.15.0-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:896689fddba4f23ef7c718279e42f8834041a21342d95e56922e1c10c0cc7afb"}, + {file = "wrapt-1.15.0-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:75669d77bb2c071333417617a235324a1618dba66f82a750362eccbe5b61d248"}, + {file = "wrapt-1.15.0-cp35-cp35m-win32.whl", hash = "sha256:fbec11614dba0424ca72f4e8ba3c420dba07b4a7c206c8c8e4e73f2e98f4c559"}, + {file = "wrapt-1.15.0-cp35-cp35m-win_amd64.whl", hash = "sha256:fd69666217b62fa5d7c6aa88e507493a34dec4fa20c5bd925e4bc12fce586639"}, + {file = "wrapt-1.15.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:b0724f05c396b0a4c36a3226c31648385deb6a65d8992644c12a4963c70326ba"}, + {file = "wrapt-1.15.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bbeccb1aa40ab88cd29e6c7d8585582c99548f55f9b2581dfc5ba68c59a85752"}, + {file = "wrapt-1.15.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:38adf7198f8f154502883242f9fe7333ab05a5b02de7d83aa2d88ea621f13364"}, + {file = 
"wrapt-1.15.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:578383d740457fa790fdf85e6d346fda1416a40549fe8db08e5e9bd281c6a475"}, + {file = "wrapt-1.15.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:a4cbb9ff5795cd66f0066bdf5947f170f5d63a9274f99bdbca02fd973adcf2a8"}, + {file = "wrapt-1.15.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:af5bd9ccb188f6a5fdda9f1f09d9f4c86cc8a539bd48a0bfdc97723970348418"}, + {file = "wrapt-1.15.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:b56d5519e470d3f2fe4aa7585f0632b060d532d0696c5bdfb5e8319e1d0f69a2"}, + {file = "wrapt-1.15.0-cp36-cp36m-win32.whl", hash = "sha256:77d4c1b881076c3ba173484dfa53d3582c1c8ff1f914c6461ab70c8428b796c1"}, + {file = "wrapt-1.15.0-cp36-cp36m-win_amd64.whl", hash = "sha256:077ff0d1f9d9e4ce6476c1a924a3332452c1406e59d90a2cf24aeb29eeac9420"}, + {file = "wrapt-1.15.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5c5aa28df055697d7c37d2099a7bc09f559d5053c3349b1ad0c39000e611d317"}, + {file = "wrapt-1.15.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a8564f283394634a7a7054b7983e47dbf39c07712d7b177b37e03f2467a024e"}, + {file = "wrapt-1.15.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:780c82a41dc493b62fc5884fb1d3a3b81106642c5c5c78d6a0d4cbe96d62ba7e"}, + {file = "wrapt-1.15.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e169e957c33576f47e21864cf3fc9ff47c223a4ebca8960079b8bd36cb014fd0"}, + {file = "wrapt-1.15.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:b02f21c1e2074943312d03d243ac4388319f2456576b2c6023041c4d57cd7019"}, + {file = "wrapt-1.15.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f2e69b3ed24544b0d3dbe2c5c0ba5153ce50dcebb576fdc4696d52aa22db6034"}, + {file = "wrapt-1.15.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = 
"sha256:d787272ed958a05b2c86311d3a4135d3c2aeea4fc655705f074130aa57d71653"}, + {file = "wrapt-1.15.0-cp37-cp37m-win32.whl", hash = "sha256:02fce1852f755f44f95af51f69d22e45080102e9d00258053b79367d07af39c0"}, + {file = "wrapt-1.15.0-cp37-cp37m-win_amd64.whl", hash = "sha256:abd52a09d03adf9c763d706df707c343293d5d106aea53483e0ec8d9e310ad5e"}, + {file = "wrapt-1.15.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cdb4f085756c96a3af04e6eca7f08b1345e94b53af8921b25c72f096e704e145"}, + {file = "wrapt-1.15.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:230ae493696a371f1dbffaad3dafbb742a4d27a0afd2b1aecebe52b740167e7f"}, + {file = "wrapt-1.15.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63424c681923b9f3bfbc5e3205aafe790904053d42ddcc08542181a30a7a51bd"}, + {file = "wrapt-1.15.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6bcbfc99f55655c3d93feb7ef3800bd5bbe963a755687cbf1f490a71fb7794b"}, + {file = "wrapt-1.15.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c99f4309f5145b93eca6e35ac1a988f0dc0a7ccf9ccdcd78d3c0adf57224e62f"}, + {file = "wrapt-1.15.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b130fe77361d6771ecf5a219d8e0817d61b236b7d8b37cc045172e574ed219e6"}, + {file = "wrapt-1.15.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:96177eb5645b1c6985f5c11d03fc2dbda9ad24ec0f3a46dcce91445747e15094"}, + {file = "wrapt-1.15.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5fe3e099cf07d0fb5a1e23d399e5d4d1ca3e6dfcbe5c8570ccff3e9208274f7"}, + {file = "wrapt-1.15.0-cp38-cp38-win32.whl", hash = "sha256:abd8f36c99512755b8456047b7be10372fca271bf1467a1caa88db991e7c421b"}, + {file = "wrapt-1.15.0-cp38-cp38-win_amd64.whl", hash = "sha256:b06fa97478a5f478fb05e1980980a7cdf2712015493b44d0c87606c1513ed5b1"}, + {file = "wrapt-1.15.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:2e51de54d4fb8fb50d6ee8327f9828306a959ae394d3e01a1ba8b2f937747d86"}, + {file = "wrapt-1.15.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0970ddb69bba00670e58955f8019bec4a42d1785db3faa043c33d81de2bf843c"}, + {file = "wrapt-1.15.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76407ab327158c510f44ded207e2f76b657303e17cb7a572ffe2f5a8a48aa04d"}, + {file = "wrapt-1.15.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd525e0e52a5ff16653a3fc9e3dd827981917d34996600bbc34c05d048ca35cc"}, + {file = "wrapt-1.15.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d37ac69edc5614b90516807de32d08cb8e7b12260a285ee330955604ed9dd29"}, + {file = "wrapt-1.15.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:078e2a1a86544e644a68422f881c48b84fef6d18f8c7a957ffd3f2e0a74a0d4a"}, + {file = "wrapt-1.15.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2cf56d0e237280baed46f0b5316661da892565ff58309d4d2ed7dba763d984b8"}, + {file = "wrapt-1.15.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7dc0713bf81287a00516ef43137273b23ee414fe41a3c14be10dd95ed98a2df9"}, + {file = "wrapt-1.15.0-cp39-cp39-win32.whl", hash = "sha256:46ed616d5fb42f98630ed70c3529541408166c22cdfd4540b88d5f21006b0eff"}, + {file = "wrapt-1.15.0-cp39-cp39-win_amd64.whl", hash = "sha256:eef4d64c650f33347c1f9266fa5ae001440b232ad9b98f1f43dfe7a79435c0a6"}, + {file = "wrapt-1.15.0-py3-none-any.whl", hash = "sha256:64b1df0f83706b4ef4cfb4fb0e4c2669100fd7ecacfb59e091fad300d4e04640"}, + {file = "wrapt-1.15.0.tar.gz", hash = "sha256:d06730c6aed78cee4126234cf2d071e01b44b915e725a6cb439a879ec9754a3a"}, +] + +[metadata] +lock-version = "2.0" +python-versions = "^3.7" +content-hash = "6dd48af9ea10e0d441e2b6ee3dcdea67bd5b4cc0b6c13b672761212decbaa5f6" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 
0000000000000000000000000000000000000000..ac1e2619053761b11ab2f5e246587fe4e96e1a60 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,43 @@ +[tool.poetry] +name = "pdf.tocgen" +version = "1.3.4" +description = "Automatically generate table of contents for pdf files" +authors = ["krasjet"] +license = "GPL-3.0-or-later" +readme = "README.md" +homepage = "https://krasjet.com/voice/pdf.tocgen/" +repository = "https://github.com/Krasjet/pdf.tocgen" +keywords = ["pdf", "cli"] + +classifiers = [ + "Development Status :: 3 - Alpha", + "Environment :: Console", + "Intended Audience :: End Users/Desktop" +] + +packages = [ + { include = "pdfxmeta" }, + { include = "pdftocgen" }, + { include = "pdftocio" }, + { include = "fitzutils" } +] + +[tool.poetry.dependencies] +python = "^3.7" +PyMuPDF = "^1.18.14" +toml = "^0.10.2" +chardet = "^5.1.0" + +[tool.poetry.dev-dependencies] +pylint = "^2.5.3" +jedi = "^0.17.2" +mamba = "^0.11.1" + +[tool.poetry.scripts] +pdfxmeta = "pdfxmeta.app:main" +pdftocgen = "pdftocgen.app:main" +pdftocio = "pdftocio.app:main" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/recipes/README.md b/recipes/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3c394485bc24eba8adf10a98f4492aac0a3acb11 --- /dev/null +++ b/recipes/README.md @@ -0,0 +1,11 @@ +recipes +======= + +This directory contains some pre-made recipes for `pdftocgen`. It could be a +good reference if you want to craft your own recipes. Feel free to contribute +more. + +The recipes in this directory is separately licensed under the [CC BY-NC-SA 4.0 +License][cc] to prevent any commercial usage. 
+ +[cc]: https://creativecommons.org/licenses/by-nc-sa/4.0/ diff --git a/recipes/default_groff_man.toml b/recipes/default_groff_man.toml new file mode 100644 index 0000000000000000000000000000000000000000..ab3a4690848d38c09d643a7ae01f5f6622768ade --- /dev/null +++ b/recipes/default_groff_man.toml @@ -0,0 +1,12 @@ +# The recipe for +# $ man -Tpdf man > out.pdf +# only tested under groff +[[heading]] +level = 1 +font.name = "Times-Bold" +font.size = 10.949999809265137 +font.superscript = false +font.italic = false +font.serif = true +font.monospace = false +font.bold = true diff --git a/recipes/default_groff_ms.toml b/recipes/default_groff_ms.toml new file mode 100644 index 0000000000000000000000000000000000000000..9e59a1daf1f8c42b3aceeee59700419b905a1017 --- /dev/null +++ b/recipes/default_groff_ms.toml @@ -0,0 +1,12 @@ +# The recipe for the default groff_ms, produced by +# $ groff -ms -Tpdf in.ms > out.pdf + +[[heading]] +level = 1 +font.name = "Times-Bold" +font.size = 10 +bbox.left = 72 + +# All the headings (.NH) have the same font attributes, so you need to manually +# format the heading levels of the toc (for vim users, >> in normal mode will +# add indentation to a line) diff --git a/recipes/default_latex.toml b/recipes/default_latex.toml new file mode 100644 index 0000000000000000000000000000000000000000..f79bacf0d0c7e2b431710240f2e3ff0eb6de1f8a --- /dev/null +++ b/recipes/default_latex.toml @@ -0,0 +1,24 @@ +# The recipe for +# $ pdflatex in.tex +# under default styles (Computer Modern, article class) + +[[heading]] +level = 1 +greedy = true +font.name = "CMBX12" +font.size = 14.346199989318848 +font.size_tolerance = 0.01 + +[[heading]] +level = 2 +greedy = true +font.name = "CMBX12" +font.size = 11.9552001953125 +font.size_tolerance = 0.01 + +[[heading]] +level = 3 +greedy = true +font.name = "CMBX10" +font.size = 9.962599754333496 +font.size_tolerance = 0.01 diff --git a/recipes/ft.toml b/recipes/ft.toml new file mode 100644 index 
0000000000000000000000000000000000000000..eb59c5638bde1ea0a360fa0065c37cc55adeb431 --- /dev/null +++ b/recipes/ft.toml @@ -0,0 +1,23 @@ +# The recipe for "Lecture Notes for EE 261" [1] by Brad Osgood +# +# [1]: https://see.stanford.edu/materials/lsoftaee261/book-fall-07.pdf +# archive: https://web.archive.org/https://see.stanford.edu/materials/lsoftaee261/book-fall-07.pdf + +[[heading]] +level = 1 +greedy = true +font.name = "CMBX12" +font.size = 24.78696060180664 + +[[heading]] +level = 2 +greedy = true +font.name = "CMBX12" +font.size = 14.346190452575684 + +[[heading]] +level = 3 +greedy = true +font.name = "CMBX12" +font.size = 11.955169677734375 + diff --git a/recipes/htdc.toml b/recipes/htdc.toml new file mode 100644 index 0000000000000000000000000000000000000000..3783190ace58e29965265b9611ad95887b612fd4 --- /dev/null +++ b/recipes/htdc.toml @@ -0,0 +1,26 @@ +# The recipe for HtDC by Matthias Felleisen, et al. +# +# The output need some manual clean up. For example, the table of contents in +# the original document is incorrectedly included in the outline, but they +# should be easy to remove using a text editor. +# +# [1]: https://felleisen.org/matthias/HtDC/htdc.pdf + +[[heading]] +level = 1 +font.name = "Palatino-Bold" +font.size = 17.21540069580078 +font.color = 0x221f1f + +[[heading]] +level = 2 +font.name = "Palatino-Bold" +font.size = 14.346199989318848 +font.color = 0x221f1f + +[[heading]] +level = 3 +greedy = true +font.name = "Palatino-Bold" +font.size = 11.9552001953125 +font.color = 0x221f1f diff --git a/recipes/onlisp.toml b/recipes/onlisp.toml new file mode 100644 index 0000000000000000000000000000000000000000..00873f96b8f6bd82e964419ba0e75bbcdc6b9e48 --- /dev/null +++ b/recipes/onlisp.toml @@ -0,0 +1,15 @@ +# The recipe for "On Lisp" [1] by Paul Graham +# +# Note that you need to download the PDF version. The PDF is well structured +# and no extra processing is needed. 
+# [1]: http://www.paulgraham.com/onlisptext.html + +[[heading]] +level = 1 +font.name = "Times-Bold" +font.size = 19.92530059814453 + +[[heading]] +level = 2 +font.name = "Times-Bold" +font.size = 11.9552001953125 diff --git a/recipes/recipe.toml b/recipes/recipe.toml new file mode 100644 index 0000000000000000000000000000000000000000..85a5af2253d5f4db045c3d79c79877277e533e81 --- /dev/null +++ b/recipes/recipe.toml @@ -0,0 +1,5 @@ +[[heading]] +level = 1 +greedy = true +font.name = "CaslonFiveForty-Roman" +font.size = 54.10 diff --git a/requirements.txt b/requirements.txt index 28d994e22f8dd432b51df193562052e315ad95f7..1813feaee654ded47884902fd56ffa99962c83d2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,6 @@ -altair -pandas -streamlit \ No newline at end of file +streamlit +pandas +PyMuPDF==1.25.2 +toml +chardet +. diff --git a/spec/__init__.py b/spec/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/spec/cli_spec.sh b/spec/cli_spec.sh new file mode 100644 index 0000000000000000000000000000000000000000..d65e40dc3e40ddfa5bad92970f2e2f413e8a8dc2 --- /dev/null +++ b/spec/cli_spec.sh @@ -0,0 +1,63 @@ +#!/bin/bash -e + +SPEC="spec/files" + +checkeq() { + if res=$(diff "$1" "$2"); then + echo "[✓]" + else + echo "[✗]" + printf "%s\n" "$res" + return 1 + fi +} + +it() { + printf " it %s " "$*" +} + +printf "pdfxmeta\n" + +it "extracts metadata correctly" +checkeq <(pdfxmeta -p 1 "$SPEC/level2.pdf" "Section") \ + "$SPEC/level2_meta" + +it "extracts metadata in auto mode correctly" +checkeq <(pdfxmeta -a 1 -p 1 "$SPEC/level2.pdf" "Section") \ + "$SPEC/level2_meta.toml" + +printf "\npdftocgen\n" + +it "generates toc for 2 level heading correctly" +checkeq <(pdftocgen "$SPEC/level2.pdf" < "$SPEC/level2_recipe.toml") \ + "$SPEC/level2.toc" + +it "generates toc for one page headings correctly" +checkeq <(pdftocgen "$SPEC/onepage.pdf" < "$SPEC/onepage_greedy.toml") \ + 
"$SPEC/onepage.toc" + +it "generates toc for hard mode correctly" +checkeq <(pdftocgen "$SPEC/hardmode.pdf" < "$SPEC/hardmode_recipe.toml") \ + "$SPEC/hardmode.toc" + +it "generates readable toc" +checkeq <(pdftocgen -H "$SPEC/level2.pdf" < "$SPEC/level2_recipe.toml") \ + "$SPEC/level2_h.toc" + +printf "\npdftocio\n" + +tmpdir=$(mktemp -d) + +it "adds toc to pdf and prints toc correctly" +checkeq <(pdftocgen "$SPEC/hardmode.pdf" < "$SPEC/hardmode_recipe.toml" | \ + pdftocio -o "$tmpdir/out.pdf" "$SPEC/hardmode.pdf" && \ + pdftocio -p "$tmpdir/out.pdf") \ + "$SPEC/hardmode.toc" + +it "prints toc when -p is set" +checkeq <(pdftocio -p "$SPEC/hastoc.pdf" < $SPEC/level2.toc) \ + "$SPEC/hastoc.toc" + +it "prints toc vpos when -v is set" +checkeq <(pdftocio -p -v "$SPEC/hastoc.pdf") \ + "$SPEC/hastoc_v.toc" diff --git a/spec/files/Makefile b/spec/files/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..bc48f892720067d4b6d07e849ce79366c9da0fc5 --- /dev/null +++ b/spec/files/Makefile @@ -0,0 +1,12 @@ +.PHONY: all clean + +all: level2.pdf hastoc.pdf onepage.pdf hardmode.pdf + +%.pdf: %.tex + latexmk -pdf $< + +clean: + rm -f *.aux *.dvi *.fdb_latexmk *.fls *.log *.out + +nuke: clean + rm -f *.pdf diff --git a/spec/files/hardmode.pdf b/spec/files/hardmode.pdf new file mode 100644 index 0000000000000000000000000000000000000000..aed1b467ef6b4926771892d1c2cc7ade6dd1813b --- /dev/null +++ b/spec/files/hardmode.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9be6a1628292675b467b36a503c37ffa4d3073d2ff87d147dced3b3bff394875 +size 110985 diff --git a/spec/files/hardmode.tex b/spec/files/hardmode.tex new file mode 100644 index 0000000000000000000000000000000000000000..48242a7cd593df79ce1f1142fc8fb617bddd8e08 --- /dev/null +++ b/spec/files/hardmode.tex @@ -0,0 +1,68 @@ +\documentclass{article}[12pt] + +\usepackage{lipsum} +\usepackage{multicol} +\usepackage{amsmath} +\usepackage{amsfonts} +\usepackage[USenglish]{babel} 
+\usepackage[stretch=10,shrink=10]{microtype} +\usepackage[left=1.3in, + right=1.3in, + top=1in, + bottom=1in, + footskip=.5in]{geometry} +\setlength{\columnsep}{0.4in} + +\renewcommand{\rmdefault}{zpltlf} +\usepackage{newpxtext} +% will mess up embeded symbols +% \usepackage{newpxmath} + +\title{The hard mode} +\author{krasjet} +\date{} + +\begin{document} +\begin{multicols}{2} +[ + \maketitle +] + +\section{Section One} + +\lipsum[2-3] + +\section{Section $1 + 1 = 2$} + +\lipsum[2-1] +\begin{align*} + x^2 + 2 = 4 +\end{align*} +\lipsum[2-1] + +\subsection{Subsection Two.One} +\lipsum[2-5] + +\section*{$\mathrm{e}^{\ln(3)}$} + +\setcounter{section}{3} +\setcounter{subsection}{0} + +\lipsum[1-2] + +\subsection{Subsection $\mathrm{e}^{\ln(3)}$.1, with looo\-ooooooooong title} +\lipsum[2-5] + +\subsection{$\mathbb{S}$ubsection Three.Two, another long title} +\lipsum[1-1] + +\subsection{Subsection Three.Three} +\lipsum[2-3] + +\section{The $x \to \infty$ End} + +\lipsum[2-2] + +\end{multicols} + +\end{document} diff --git a/spec/files/hardmode.toc b/spec/files/hardmode.toc new file mode 100644 index 0000000000000000000000000000000000000000..aa7398bc42cffb1f98290f1f1d73c5d243f8a342 --- /dev/null +++ b/spec/files/hardmode.toc @@ -0,0 +1,8 @@ +"1 Section One" 1 +"2 Section 1 + 1 = 2" 1 + "2.1 Subsection Two.One" 1 +"e ln(3)" 2 + "3.1 Subsection e ln(3) .1, with looo- ooooooooong title" 2 + "3.2 S ubsection Three.Two, another long title" 3 + "3.3 Subsection Three.Three" 3 +"4 The x → ∞ End" 3 diff --git a/spec/files/hardmode_recipe.toml b/spec/files/hardmode_recipe.toml new file mode 100644 index 0000000000000000000000000000000000000000..4f67312a7be3dd4dcb5de533a0f413b8b2536e7d --- /dev/null +++ b/spec/files/hardmode_recipe.toml @@ -0,0 +1,18 @@ +[[heading]] +level = 1 +greedy = true +font.name = "TeXGyrePagellaX-Bold" +font.size = 14.346199989318848 + +[[heading]] +level = 1 +greedy = true +font.name = "CMR10" +font.size = 9.962599754333496 +font.superscript = true + 
+[[heading]] +level = 2 +greedy = true +font.name = "TeXGyrePagellaX-Bold" +font.size = 11.9552001953125 diff --git a/spec/files/hastoc.pdf b/spec/files/hastoc.pdf new file mode 100644 index 0000000000000000000000000000000000000000..2e518d118710edac73ccf3d2c56faadc0aae8e3f Binary files /dev/null and b/spec/files/hastoc.pdf differ diff --git a/spec/files/hastoc.tex b/spec/files/hastoc.tex new file mode 100644 index 0000000000000000000000000000000000000000..1b1f62a47cd02343eeee77ca0549aed8fd6ff789 --- /dev/null +++ b/spec/files/hastoc.tex @@ -0,0 +1,42 @@ +\documentclass{article} + +\usepackage{lipsum} +\usepackage{hyperref} + +\title{2 Level Heading Test} +\author{krasjet} +\date{} + +\begin{document} +\maketitle + +\section{Section One} + +\lipsum[2-4] + +\section{Section Two} + +\lipsum[2-5] + +\subsection{Subsection Two.One} +\lipsum[2-5] + +\section{Section Three, with looong loooong looong title} + +\lipsum[1-2] + +\subsection{Subsection Three.One, with even loooooooooooonger title, and +probably even more} +\lipsum[2-5] + +\subsection{Subsection Three.Two} +\lipsum[1-1] + +\subsection{Subsection Three.Three} +\lipsum[2-3] + +\section{The End} + +\lipsum[2-5] + +\end{document} diff --git a/spec/files/hastoc.toc b/spec/files/hastoc.toc new file mode 100644 index 0000000000000000000000000000000000000000..b1916bbcb3700c1205acbe5e999b4b59b8b4a300 --- /dev/null +++ b/spec/files/hastoc.toc @@ -0,0 +1,8 @@ +"Section One" 1 +"Section Two" 1 + "Subsection Two.One" 2 +"Section Three, with looong loooong looong title" 3 + "Subsection Three.One, with even loooooooooooonger title, and probably even more" 3 + "Subsection Three.Two" 4 + "Subsection Three.Three" 5 +"The End" 5 diff --git a/spec/files/hastoc_v.toc b/spec/files/hastoc_v.toc new file mode 100644 index 0000000000000000000000000000000000000000..5b82fb6f6cd404540ce48102ae32336e63d78e34 --- /dev/null +++ b/spec/files/hastoc_v.toc @@ -0,0 +1,8 @@ +"Section One" 1 234.65998 +"Section Two" 1 562.148 + "Subsection 
Two.One" 2 449.522 +"Section Three, with looong loooong looong title" 3 330.333 + "Subsection Three.One, with even loooooooooooonger title, and probably even more" 3 616.444 + "Subsection Three.Two" 4 509.298 + "Subsection Three.Three" 5 124.802 +"The End" 5 361.387 diff --git a/spec/files/level2.pdf b/spec/files/level2.pdf new file mode 100644 index 0000000000000000000000000000000000000000..1abfde38371e99a859bc831c240170470fade538 Binary files /dev/null and b/spec/files/level2.pdf differ diff --git a/spec/files/level2.tex b/spec/files/level2.tex new file mode 100644 index 0000000000000000000000000000000000000000..9457c2ab681ecff3e0b9ba4b18fd2341819108e3 --- /dev/null +++ b/spec/files/level2.tex @@ -0,0 +1,41 @@ +\documentclass{article} + +\usepackage{lipsum} + +\title{2 Level Heading Test} +\author{krasjet} +\date{} + +\begin{document} +\maketitle + +\section{Section One} + +\lipsum[2-4] + +\section{Section Two} + +\lipsum[2-5] + +\subsection{Subsection Two.One} +\lipsum[2-5] + +\section{Section Three, with looong loooong looong title} + +\lipsum[1-2] + +\subsection{Subsection Three.One, with even loooooooooooonger title, and +probably even more} +\lipsum[2-5] + +\subsection{Subsection Three.Two} +\lipsum[1-1] + +\subsection{Subsection Three.Three} +\lipsum[2-3] + +\section{The End} + +\lipsum[2-5] + +\end{document} diff --git a/spec/files/level2.toc b/spec/files/level2.toc new file mode 100644 index 0000000000000000000000000000000000000000..ba9e8df8f2a7a3c97d7386b2ac7ed359eb4ca8e7 --- /dev/null +++ b/spec/files/level2.toc @@ -0,0 +1,8 @@ +"1 Section One" 1 +"2 Section Two" 1 + "2.1 Subsection Two.One" 2 +"3 Section Three, with looong loooong looong ti- tle" 3 + "3.1 Subsection Three.One, with even loooooooooooonger title, and probably even more" 3 + "3.2 Subsection Three.Two" 4 + "3.3 Subsection Three.Three" 5 +"4 The End" 5 diff --git a/spec/files/level2_h.toc b/spec/files/level2_h.toc new file mode 100644 index 
0000000000000000000000000000000000000000..cd65d7d2cdd2fb36a70a633ec42632d5fb0402a0 --- /dev/null +++ b/spec/files/level2_h.toc @@ -0,0 +1,8 @@ +1 Section One ··· 1 +2 Section Two ··· 1 + 2.1 Subsection Two.One ··· 2 +3 Section Three, with looong loooong looong ti- tle ··· 3 + 3.1 Subsection Three.One, with even loooooooooooonger title, and probably even more ··· 3 + 3.2 Subsection Three.Two ··· 4 + 3.3 Subsection Three.Three ··· 5 +4 The End ··· 5 diff --git a/spec/files/level2_meta b/spec/files/level2_meta new file mode 100644 index 0000000000000000000000000000000000000000..488e41923de2458207a2fedcb56a1092498d9149 --- /dev/null +++ b/spec/files/level2_meta @@ -0,0 +1,26 @@ +Section One: + font.name = "CMBX12" + font.size = 14.346199989318848 + font.color = 0x000000 + font.superscript = false + font.italic = false + font.serif = true + font.monospace = false + font.bold = true + bbox.left = 157.98439025878906 + bbox.top = 237.6484375 + bbox.right = 243.12905883789062 + bbox.bottom = 252.00897216796875 +Section Two: + font.name = "CMBX12" + font.size = 14.346199989318848 + font.color = 0x000000 + font.superscript = false + font.italic = false + font.serif = true + font.monospace = false + font.bold = true + bbox.left = 157.98439025878906 + bbox.top = 567.3842163085938 + bbox.right = 245.18057250976562 + bbox.bottom = 581.7447509765625 diff --git a/spec/files/level2_meta.toml b/spec/files/level2_meta.toml new file mode 100644 index 0000000000000000000000000000000000000000..e7886a91f71667fecbdba97f62f5a5bfc605d710 --- /dev/null +++ b/spec/files/level2_meta.toml @@ -0,0 +1,38 @@ +[[heading]] +# Section One +level = 1 +greedy = true +font.name = "CMBX12" +font.size = 14.346199989318848 +# font.size_tolerance = 1e-5 +# font.color = 0x000000 +# font.superscript = false +# font.italic = false +# font.serif = true +# font.monospace = false +# font.bold = true +# bbox.left = 157.98439025878906 +# bbox.top = 237.6484375 +# bbox.right = 243.12905883789062 +# bbox.bottom = 
252.00897216796875 +# bbox.tolerance = 1e-5 + +[[heading]] +# Section Two +level = 1 +greedy = true +font.name = "CMBX12" +font.size = 14.346199989318848 +# font.size_tolerance = 1e-5 +# font.color = 0x000000 +# font.superscript = false +# font.italic = false +# font.serif = true +# font.monospace = false +# font.bold = true +# bbox.left = 157.98439025878906 +# bbox.top = 567.3842163085938 +# bbox.right = 245.18057250976562 +# bbox.bottom = 581.7447509765625 +# bbox.tolerance = 1e-5 + diff --git a/spec/files/level2_recipe.toml b/spec/files/level2_recipe.toml new file mode 100644 index 0000000000000000000000000000000000000000..9a69ff7864e4bc658fd44bff50f01735337ff17a --- /dev/null +++ b/spec/files/level2_recipe.toml @@ -0,0 +1,9 @@ +[[heading]] +level = 1 +font.name = "CMBX12" +font.size = 14.346199989318848 + +[[heading]] +level = 2 +font.name = "CMBX12" +font.size = 11.9552001953125 diff --git a/spec/files/onepage.pdf b/spec/files/onepage.pdf new file mode 100644 index 0000000000000000000000000000000000000000..4e4bfb3d1a90cf9d35cee459af93cb5677307eb5 Binary files /dev/null and b/spec/files/onepage.pdf differ diff --git a/spec/files/onepage.tex b/spec/files/onepage.tex new file mode 100644 index 0000000000000000000000000000000000000000..224b3e62a53af1ea1f4148c88e96a15dfd3d2b5b --- /dev/null +++ b/spec/files/onepage.tex @@ -0,0 +1,37 @@ +\documentclass{article} + +\usepackage{lipsum} + +\title{One page Test} +\author{krasjet} +\date{} + +\begin{document} +\maketitle + +\section{Section One} + +\section{Section Two} + +\subsection{Subsection Two.One} +\subsection{Subsection Two.Two $\times 2$} + +\section{Section Three, with looong loooong looong title} + +\subsection{Subsection Three.One, with even loooooooooooonger title, and +probably even more} + +\subsection{Subsection Three.Two} + +\subsection{Subsection Three.Three} +\subsubsection{Subsubsection Three.Three.One} +\subsubsection{Subsubsection Three.Three.Two} +\subsubsection{Subsubsection Three.Three.Three} + 
+\subsection{Subsection Three.Four} + +\subsection{Subsection Three.Five} + +\section{The End} + +\end{document} diff --git a/spec/files/onepage.toc b/spec/files/onepage.toc new file mode 100644 index 0000000000000000000000000000000000000000..cab90366a9af8cc9ee4368806f84ead29dd76b19 --- /dev/null +++ b/spec/files/onepage.toc @@ -0,0 +1,14 @@ +"1 Section One" 1 +"2 Section Two" 1 + "2.1 Subsection Two.One" 1 + "2.2 Subsection Two.Two × 2" 1 +"3 Section Three, with looong loooong looong ti- tle" 1 + "3.1 Subsection Three.One, with even loooooooooooonger title, and probably even more" 1 + "3.2 Subsection Three.Two" 1 + "3.3 Subsection Three.Three" 1 + "3.3.1 Subsubsection Three.Three.One" 1 + "3.3.2 Subsubsection Three.Three.Two" 1 + "3.3.3 Subsubsection Three.Three.Three" 1 + "3.4 Subsection Three.Four" 1 + "3.5 Subsection Three.Five" 1 +"4 The End" 1 diff --git a/spec/files/onepage_greedy.toml b/spec/files/onepage_greedy.toml new file mode 100644 index 0000000000000000000000000000000000000000..ff645f2f8ab30042c52e9f90c5b1e9189ff5c2d3 --- /dev/null +++ b/spec/files/onepage_greedy.toml @@ -0,0 +1,15 @@ +[[heading]] +level = 1 +font.name = "CMBX12" +font.size = 14.346199989318848 + +[[heading]] +level = 2 +greedy = true +font.name = "CMBX12" +font.size = 11.9552001953125 + +[[heading]] +level = 3 +font.name = "CMBX10" +font.size = 9.962599754333496 diff --git a/spec/files/onepage_recipe.toml b/spec/files/onepage_recipe.toml new file mode 100644 index 0000000000000000000000000000000000000000..1602f87bf67e0667012e11fe6f49e15f20e1ae2e --- /dev/null +++ b/spec/files/onepage_recipe.toml @@ -0,0 +1,14 @@ +[[heading]] +level = 1 +font.name = "CMBX12" +font.size = 14.346199989318848 + +[[heading]] +level = 2 +font.name = "(CMBX12|CMSY10|CMR12)" +font.size = 11.9552001953125 + +[[heading]] +level = 3 +font.name = "CMBX10" +font.size = 9.962599754333496 diff --git a/spec/files/recipe_spec.toml b/spec/files/recipe_spec.toml new file mode 100644 index 
0000000000000000000000000000000000000000..fc9bbe76e2bbd0ac0f9780ed5f0c79011f213c1e --- /dev/null +++ b/spec/files/recipe_spec.toml @@ -0,0 +1,33 @@ +[[heading]] +level = 1 +font.name = "CMBX12" +font.size = 14.346199989318848 +font.size_tolerance = 1e-5 +font.color = 0x000000 +font.superscript = false +font.italic = false +font.serif = true +font.monospace = false +font.bold = true +bbox.left = 157.98439025878906 +bbox.top = 335.569580078125 +bbox.right = 477.66058349609375 +bbox.bottom = 349.93011474609375 +bbox.tolerance = 1e-5 + +[[heading]] +level = 2 +font.name = "CMBX10" +font.size = 9.962599754333496 +font.size_tolerance = 1e-5 +font.color = 0x000000 +font.superscript = false +font.italic = false +font.serif = true +font.monospace = false +font.bold = true +bbox.left = 168.76663208007812 +bbox.top = 127.2930679321289 +bbox.right = 280.66656494140625 +bbox.bottom = 137.2556610107422 +bbox.tolerance = 1e-5 diff --git a/spec/filter_spec.py b/spec/filter_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..ac02003c36bccd9b573e487f0915fb7383017df4 --- /dev/null +++ b/spec/filter_spec.py @@ -0,0 +1,642 @@ +import os + +from mamba import description, it, before +from pdftocgen.filter import ( + ToCFilter, + admits_float, + FontFilter, + BoundingBoxFilter +) + +dirpath = os.path.dirname(os.path.abspath(__file__)) + +with description("admits_float") as self: + with it("admits if difference is below tol"): + assert admits_float(1, 1.05, 0.1) + assert admits_float(1, 0.95, 0.1) + + with it("does not admit if difference is too large"): + assert not admits_float(1, 1.5, 0.1) + assert not admits_float(1, 0.5, 0.1) + + with it("admits anything if expect is unset"): + assert admits_float(None, 1, 0.1) + assert admits_float(None, None, 0.1) + + with it("does not admit if expect is set but actual is None"): + assert not admits_float(1, None, 0.1) + +with description("ToCFilter") as self: + with before.all: + self.title_exact = { + 'level': 1, + 'font': 
{ + 'name': "CMBX12", + 'size': 14.346199989318848, + 'size_tolerance': 0, + 'color': 0, + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': True + }, + 'bbox': { + 'left': 157.98439025878906, + 'top': 567.3842163085938, + 'right': 245.18057250976562, + 'bottom': 581.7447509765625, + 'tolerance': 0 + } + } + + self.text_exact = { + 'level': 2, + 'font': { + 'name': "CMR10", + 'size': 9.962599754333496, + 'size_tolerance': 0, + 'color': 0, + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': False + }, + 'bbox': { + 'left': 133.76800537109375, + 'top': 592.492919921875, + 'right': 477.537353515625, + 'bottom': 602.4555053710938, + 'tolerance': 0 + } + } + + self.spn_title = { + 'size': 14.346199989318848, + 'flags': 20, + 'font': 'TZOLRB+CMBX12', + 'color': 0, + 'text': 'Section Two', + 'bbox': (157.98439025878906, + 567.3842163085938, + 245.18057250976562, + 581.7447509765625) + } + + self.spn_text = { + 'size': 9.962599754333496, + 'flags': 4, + 'font': 'MJDLZY+CMR10', + 'color': 0, + 'text': 'text', + 'bbox': (133.76800537109375, + 592.492919921875, + 477.537353515625, + 602.4555053710938) + } + + with it("raises error if no toc level is specified"): + try: + fltr = ToCFilter({}) + except ValueError: + pass + except: + assert False, "must raise error" + + with it("raises error if toc level is invalid"): + try: + fltr = ToCFilter({'level': 0}) + fltr = ToCFilter({'level': -1}) + except ValueError: + pass + except: + assert False, "must raise error" + + with it("does not raise error if toc level is valid"): + try: + fltr = ToCFilter({'level': 1}) + fltr = ToCFilter({'level': 2}) + except ValueError: + assert False, "must not raise error" + + with it("admits exact matches"): + filter_title = ToCFilter(self.title_exact) + filter_text = ToCFilter(self.text_exact) + assert filter_title.admits(self.spn_title) + assert filter_text.admits(self.spn_text) + + with it("rejects unmatched spans"): + 
filter_title = ToCFilter(self.title_exact) + filter_text = ToCFilter(self.text_exact) + assert not filter_title.admits(self.spn_text) + assert not filter_text.admits(self.spn_title) + + with it("admits correctly without bbox"): + filter_title = ToCFilter({ + 'level': 1, + 'font': { + 'name': "CMBX12", + } + }) + assert filter_title.admits(self.spn_title) + + filter_text = ToCFilter({ + 'level': 2, + 'font': { + 'size': 9.962599754333496, + } + }) + assert filter_text.admits(self.spn_text) + + with it("rejects correctly without bbox"): + filter_title = ToCFilter({ + 'level': 1, + 'font': { + 'name': "CMBX12", + } + }) + assert not filter_title.admits(self.spn_text) + + filter_text = ToCFilter({ + 'level': 2, + 'font': { + 'size': 9.962599754333496, + } + }) + assert not filter_text.admits(self.spn_title) + + with it("admits correctly without font"): + filter_title = ToCFilter({ + 'level': 1, + 'bbox': { + 'left': 157.98439025878906, + } + }) + assert filter_title.admits(self.spn_title) + + filter_text = ToCFilter({ + 'level': 2, + 'bbox': { + 'top': 592.492919921875, + } + }) + assert filter_text.admits(self.spn_text) + + with it("rejects correctly without font"): + filter_title = ToCFilter({ + 'level': 1, + 'bbox': { + 'left': 157.98439025878906, + } + }) + assert not filter_title.admits(self.spn_text) + + filter_text = ToCFilter({ + 'level': 2, + 'bbox': { + 'top': 592.492919921875, + } + }) + assert not filter_text.admits(self.spn_title) + + +with description("FontFilter") as self: + with before.all: + self.title_exact = { + 'name': "CMBX12", + 'size': 14.346199989318848, + 'size_tolerance': 0, + 'color': 0, + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': True + } + + self.text_exact = { + 'name': "CMR10", + 'size': 9.962599754333496, + 'size_tolerance': 0, + 'color': 0, + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': False + } + + self.spn_title = { + 'size': 
14.346199989318848, + 'flags': 20, + 'font': 'TZOLRB+CMBX12', + 'color': 0, + 'text': 'Section Two', + 'bbox': (157.98439025878906, + 567.3842163085938, + 245.18057250976562, + 581.7447509765625) + } + + self.spn_small_title = { + 'size': 9.962599754333496, + 'flags': 4, + 'font': 'TZOLRB+CMBX12', + 'color': 0, + 'text': 'text', + 'bbox': (133.76800537109375, + 592.492919921875, + 477.537353515625, + 602.4555053710938) + } + + self.spn_text = { + 'size': 9.962599754333496, + 'flags': 4, + 'font': 'MJDLZY+CMR10', + 'color': 0, + 'text': 'text', + 'bbox': (133.76800537109375, + 592.492919921875, + 477.537353515625, + 602.4555053710938) + } + + with it("has a working constructor"): + fnt = FontFilter(self.title_exact) + assert fnt.name.search("TZOLRB+CMBX12") + assert fnt.name.search("CMBX12") + assert not fnt.name.search("CMBX10") + assert fnt.flags == 0b10100 + assert fnt.ign_mask == 0b11111 + assert fnt.color == 0x000000 + assert fnt.size == 14.346199989318848 + assert fnt.size_tolerance == 0 + + with it("can construct if empty dict is given in the constructor"): + fnt = FontFilter({}) + assert fnt.name.search("anything") + assert fnt.flags == 0 + assert fnt.ign_mask == 0 + assert fnt.color is None + assert fnt.size is None + assert fnt.size_tolerance == 1e-5 + + with it("admits exact matches"): + fnt_title = FontFilter(self.title_exact) + fnt_text = FontFilter(self.text_exact) + assert fnt_title.admits(self.spn_title) + assert fnt_text.admits(self.spn_text) + + with it("rejects unmatched spans"): + fnt_title = FontFilter(self.title_exact) + assert not fnt_title.admits(self.spn_text) + assert not fnt_title.admits(self.spn_small_title) + + fnt_text = FontFilter(self.text_exact) + assert not fnt_text.admits(self.spn_title) + assert not fnt_text.admits(self.spn_small_title) + + with it("admits correctly without font name"): + fnt_title = FontFilter({ + 'size': 14.346199989318848, + 'size_tolerance': 0, + 'color': 0, + 'superscript': False, + 'italic': False, + 
'serif': True, + 'monospace': False, + 'bold': True + }) + assert fnt_title.admits(self.spn_title) + + with it("rejects correctly without font name"): + fnt_title = FontFilter({ + 'size': 14.346199989318848, + 'size_tolerance': 0, + 'color': 0, + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': True + }) + assert not fnt_title.admits(self.spn_text) + assert not fnt_title.admits(self.spn_small_title) + + with it("admits correctly with only font name"): + fnt_title = FontFilter({ + 'name': "CMBX12" + }) + assert fnt_title.admits(self.spn_title) + assert fnt_title.admits(self.spn_small_title) + + with it("rejects correctly with only font name"): + fnt_title = FontFilter({ + 'name': "CMBX12" + }) + assert not fnt_title.admits(self.spn_text) + + with it("admits correctly without size"): + fnt_title = FontFilter({ + 'name': "CMBX12", + 'size_tolerance': 0, + 'color': 0, + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': True + }) + assert fnt_title.admits(self.spn_title) + + with it("rejects correctly without size"): + fnt_title = FontFilter({ + 'name': "CMBX12", + 'size_tolerance': 0, + 'color': 0, + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': True + }) + assert not fnt_title.admits(self.spn_text) + assert not fnt_title.admits(self.spn_small_title) + + with it("admits correctly with only size"): + fnt_title = FontFilter({ + 'size': 14.346199989318848, + 'size_tolerance': 0 + }) + assert fnt_title.admits(self.spn_title) + + with it("rejects correctly with only size"): + fnt_title = FontFilter({ + 'size': 14.346199989318848, + 'size_tolerance': 0 + }) + assert not fnt_title.admits(self.spn_text) + assert not fnt_title.admits(self.spn_small_title) + + with it("admits correctly without color"): + fnt_title = FontFilter({ + 'name': "CMBX12", + 'size': 14.346199989318848, + 'size_tolerance': 0, + 'superscript': False, + 'italic': False, + 'serif': 
True, + 'monospace': False, + 'bold': True + }) + assert fnt_title.admits(self.spn_title) + + with it("rejects correctly without color"): + fnt_title = FontFilter({ + 'name': "CMBX12", + 'size': 14.346199989318848, + 'size_tolerance': 0, + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': True + }) + assert not fnt_title.admits(self.spn_text) + assert not fnt_title.admits(self.spn_small_title) + + with it("admits correctly with only color"): + fnt_title = FontFilter({ + 'color': 0x000000, + }) + assert fnt_title.admits(self.spn_title) + assert fnt_title.admits(self.spn_text) + assert fnt_title.admits(self.spn_small_title) + + with it("rejects correctly with only color"): + fnt_title = FontFilter({ + 'color': 0x000000, + }) + spn_blue = { + 'size': 14.346199989318848, + 'flags': 20, + 'font': 'TZOLRB+CMBX12', + 'color': 0x0000ff, + 'text': 'Section Two', + 'bbox': (157.98439025878906, + 567.3842163085938, + 245.18057250976562, + 581.7447509765625) + } + assert not fnt_title.admits(spn_blue) + + with it("admits correctly with only flags"): + fnt_title = FontFilter({ + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': True + }) + assert fnt_title.admits(self.spn_title) + + with it("rejects correctly with only flags"): + fnt_title = FontFilter({ + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': True + }) + assert not fnt_title.admits(self.spn_text) + assert not fnt_title.admits(self.spn_small_title) + + with it("admits correctly without flags"): + fnt_title = FontFilter({ + 'name': "CMBX12", + 'size': 14.346199989318848, + 'size_tolerance': 0, + 'color': 0, + }) + assert fnt_title.admits(self.spn_title) + + with it("rejects correctly without flags"): + fnt_title = FontFilter({ + 'name': "CMBX12", + 'size': 14.346199989318848, + 'size_tolerance': 0, + 'color': 0, + }) + assert not fnt_title.admits(self.spn_text) + assert not 
fnt_title.admits(self.spn_small_title) + + with it("admits correctly with partial flags"): + fnt_title = FontFilter({ + 'serif': True, + 'bold': True + }) + fnt_serif = FontFilter({ + 'serif': True + }) + fnt_sans = FontFilter({ + 'serif': False + }) + fnt_mono = FontFilter({ + 'monospace': True + }) + assert fnt_title.admits(self.spn_title) + assert fnt_serif.admits(self.spn_title) + assert fnt_serif.admits(self.spn_text) + assert fnt_sans.admits({'flags': 0b11011}) + assert fnt_mono.admits({'flags': 0b11111}) + + with it("rejects correctly with partial flags"): + fnt_title = FontFilter({ + 'serif': True, + 'bold': True + }) + fnt_serif = FontFilter({ + 'serif': True + }) + fnt_sans = FontFilter({ + 'serif': False + }) + fnt_mono = FontFilter({ + 'monospace': True + }) + assert not fnt_title.admits(self.spn_text) + assert not fnt_title.admits(self.spn_small_title) + assert not fnt_sans.admits(self.spn_title) + assert not fnt_sans.admits(self.spn_text) + assert not fnt_mono.admits(self.spn_title) + assert not fnt_mono.admits(self.spn_text) + + +with description("BoundingBoxFilter") as self: + with before.all: + self.title_exact = { + 'left': 157.98439025878906, + 'top': 567.3842163085938, + 'right': 245.18057250976562, + 'bottom': 581.7447509765625, + 'tolerance': 0 + } + + self.text_exact = { + 'left': 133.76800537109375, + 'top': 592.492919921875, + 'right': 477.537353515625, + 'bottom': 602.4555053710938, + 'tolerance': 0 + } + + self.spn_title = { + 'size': 14.346199989318848, + 'flags': 20, + 'font': 'TZOLRB+CMBX12', + 'color': 0, + 'text': 'Section Two', + 'bbox': (157.98439025878906, + 567.3842163085938, + 245.18057250976562, + 581.7447509765625) + } + + self.spn_title2 = { + 'size': 14.346199989318848, + 'flags': 20, + 'font': 'TZOLRB+CMBX12', + 'color': 0, + 'text': 'Section One', + 'bbox': (157.98439025878906, + 335.569580078125, + 477.66058349609375, + 349.93011474609375) + } + + self.spn_text = { + 'size': 9.962599754333496, + 'flags': 4, + 'font': 
'MJDLZY+CMR10', + 'color': 0, + 'text': 'text', + 'bbox': (133.76800537109375, + 592.492919921875, + 477.537353515625, + 602.4555053710938) + } + with it("has a working constructor"): + bbox = BoundingBoxFilter(self.title_exact) + assert bbox.left is not None + assert bbox.right is not None + assert bbox.top is not None + assert bbox.bottom is not None + assert bbox.tolerance == 0 + + with it("can construct if empty dict is given in the constructor"): + bbox = BoundingBoxFilter({}) + assert bbox.left is None + assert bbox.right is None + assert bbox.top is None + assert bbox.bottom is None + assert bbox.tolerance == 1e-5 + + with it("admits exact matches"): + bbox_title = BoundingBoxFilter(self.title_exact) + bbox_text = BoundingBoxFilter(self.text_exact) + assert bbox_title.admits(self.spn_title) + assert bbox_text.admits(self.spn_text) + + with it("rejects unmatched spans"): + bbox_title = BoundingBoxFilter(self.title_exact) + assert not bbox_title.admits(self.spn_text) + assert not bbox_title.admits(self.spn_title2) + + bbox_text = BoundingBoxFilter(self.text_exact) + assert not bbox_text.admits(self.spn_title) + assert not bbox_text.admits(self.spn_title2) + + with it("admits correctly with partial bbox"): + bbox_title = BoundingBoxFilter({ + 'left': 157.98439025878906 + }) + assert bbox_title.admits(self.spn_title) + assert bbox_title.admits(self.spn_title2) + + bbox_top = BoundingBoxFilter({ + 'top': 567.3842163085938 + }) + assert bbox_top.admits(self.spn_title) + + bbox_right = BoundingBoxFilter({ + 'right': 245.18057250976562 + }) + assert bbox_right.admits(self.spn_title) + + bbox_bottom = BoundingBoxFilter({ + 'bottom': 581.7447509765625 + }) + assert bbox_bottom.admits(self.spn_title) + + with it("rejects correctly with partial bbox"): + bbox_title = BoundingBoxFilter({ + 'left': 157.98439025878906 + }) + assert not bbox_title.admits(self.spn_text) + + bbox_top = BoundingBoxFilter({ + 'top': 567.3842163085938 + }) + assert not 
bbox_top.admits(self.spn_title2) + + bbox_right = BoundingBoxFilter({ + 'right': 245.18057250976562 + }) + assert not bbox_right.admits(self.spn_title2) + + bbox_bottom = BoundingBoxFilter({ + 'bottom': 581.7447509765625 + }) + assert not bbox_bottom.admits(self.spn_title2) diff --git a/spec/fitzutils_spec.py b/spec/fitzutils_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..2fb271c74bacbb6cc3d062cebcdafd3fe6671d28 --- /dev/null +++ b/spec/fitzutils_spec.py @@ -0,0 +1,101 @@ +import os +import io + +from mamba import description, it, before +from fitzutils import ( + open_pdf, + ToCEntry, + dump_toc +) +from pdftocio.tocparser import parse_toc + +dirpath = os.path.dirname(os.path.abspath(__file__)) + +valid_file = os.path.join(dirpath, "files/level2.pdf") +invalid_file = os.path.join(dirpath, "files/nothing.pdf") + +with description("open_pdf:") as self: + with it("opens pdf file for reading"): + with open_pdf(valid_file, False) as doc: + assert doc is not None + assert doc.page_count == 6 + + with it("returns None if pdf file is invalid"): + with open_pdf(invalid_file, False) as doc: + assert doc is None + + with it("exits if pdf file is invalid and exit_on_error is true"): + try: + with open_pdf(invalid_file, True) as doc: + assert False, "should have exited" + except AssertionError as err: + raise err + except: + pass + +with description("ToCEntry") as self: + with it("matches fitz's representation"): + fitz_entry = [1, "title", 2] + fitz_entry2 = [1, "title", 2, 100.0] + + toc_entry = ToCEntry(level=1, title="title", pagenum=2) + toc_entry2 = ToCEntry(level=1, title="title", pagenum=2, vpos=100.0) + + assert toc_entry.to_fitz_entry() == fitz_entry + assert toc_entry2.to_fitz_entry() == fitz_entry2 + + assert ToCEntry(*fitz_entry) == toc_entry + assert ToCEntry(*fitz_entry2) == toc_entry2 + + with it("is sorted correctly"): + entries = [ + ToCEntry(level=1, title="title4", pagenum=2, vpos=150.0), + ToCEntry(level=1, title="title3", 
pagenum=2, vpos=90.0), + ToCEntry(level=1, title="title5", pagenum=3, vpos=0.0), + ToCEntry(level=1, title="title2", pagenum=1, vpos=150.0), + ToCEntry(level=1, title="title1", pagenum=1, vpos=100.0), + ToCEntry(level=1, title="title6", pagenum=5, vpos=200.0) + ] + + expected = [ + ToCEntry(level=1, title="title1", pagenum=1, vpos=100.0), + ToCEntry(level=1, title="title2", pagenum=1, vpos=150.0), + ToCEntry(level=1, title="title3", pagenum=2, vpos=90.0), + ToCEntry(level=1, title="title4", pagenum=2, vpos=150.0), + ToCEntry(level=1, title="title5", pagenum=3, vpos=0.0), + ToCEntry(level=1, title="title6", pagenum=5, vpos=200.0) + ] + assert sorted(entries, key=ToCEntry.key) == expected + + +with description("dump_toc") as self: + with before.all: + self.toc = [ + ToCEntry(level=1, title="title1", pagenum=1, vpos=100.0), + ToCEntry(level=2, title="title2", pagenum=1, vpos=150.0), + ToCEntry(level=3, title="title3", pagenum=2, vpos=90.0), + ToCEntry(level=2, title="title4", pagenum=2, vpos=150.0), + ToCEntry(level=2, title="title5", pagenum=3, vpos=0.0), + ToCEntry(level=1, title="title6", pagenum=5, vpos=200.0) + ] + + self.toc_novpos = [ + ToCEntry(level=1, title="title1", pagenum=1), + ToCEntry(level=2, title="title2", pagenum=1), + ToCEntry(level=3, title="title3", pagenum=2), + ToCEntry(level=2, title="title4", pagenum=2), + ToCEntry(level=2, title="title5", pagenum=3), + ToCEntry(level=1, title="title6", pagenum=5) + ] + + with it("won't print vpos if vpos is False"): + toc_s = dump_toc(self.toc, False) + f = io.StringIO(toc_s) + assert parse_toc(f) == self.toc_novpos + assert parse_toc(f) != self.toc + + with it("won't print vpos if vpos is missing"): + toc_s = dump_toc(self.toc_novpos, True) + f = io.StringIO(toc_s) + assert parse_toc(f) == self.toc_novpos + assert parse_toc(f) != self.toc diff --git a/spec/parser_spec.py b/spec/parser_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..548998adef037079f0b0a3044725985d00482be7 --- 
/dev/null +++ b/spec/parser_spec.py @@ -0,0 +1,65 @@ +import os +import io + +from mamba import description, it, before +from fitzutils import ( + dump_toc, + ToCEntry +) +from pdftocio.tocparser import parse_toc + +dirpath = os.path.dirname(os.path.abspath(__file__)) + +valid_file = os.path.join(dirpath, "files/level2.pdf") +invalid_file = os.path.join(dirpath, "files/nothing.pdf") + +with description("parse_toc") as self: + with before.all: + self.toc = [ + ToCEntry(level=1, title="title1", pagenum=1, vpos=100.0), + ToCEntry(level=2, title="title2", pagenum=1, vpos=150.0), + ToCEntry(level=3, title="title3", pagenum=2, vpos=90.0), + ToCEntry(level=2, title="title4", pagenum=2, vpos=150.0), + ToCEntry(level=2, title="title5", pagenum=3, vpos=0.0), + ToCEntry(level=1, title="title6", pagenum=5, vpos=200.0) + ] + + self.toc_novpos = [ + ToCEntry(level=1, title="title1", pagenum=1), + ToCEntry(level=2, title="title2", pagenum=1), + ToCEntry(level=3, title="title3", pagenum=2), + ToCEntry(level=2, title="title4", pagenum=2), + ToCEntry(level=2, title="title5", pagenum=3), + ToCEntry(level=1, title="title6", pagenum=5) + ] + + + with it("can recover the result from dump_toc"): + toc_s = dump_toc(self.toc, True) + f = io.StringIO(toc_s) + assert parse_toc(f) == self.toc + assert parse_toc(f) != self.toc_novpos + + toc_s = dump_toc(self.toc_novpos, False) + f = io.StringIO(toc_s) + assert parse_toc(f) == self.toc_novpos + assert parse_toc(f) != self.toc + + with it("escapes quotations correctly"): + quoted = '"a ""quoted"" title" 2\n "a single \'quoted\' title" 4' + expect = [ + ToCEntry(level=1, title='a "quoted" title', pagenum=2), + ToCEntry(level=2, title="a single 'quoted' title", pagenum=4) + ] + f = io.StringIO(quoted) + assert parse_toc(f) == expect + + with it("raises error when toc entry is invalid"): + malformed = '"entry" 1\n "error entry"' + f = io.StringIO(malformed) + try: + parse_toc(f) + except IndexError: + pass + else: + assert False, "must raise 
error" diff --git a/spec/tocgen_spec.py b/spec/tocgen_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..aade8b67f47d66d0bf3913e473adca9a1702ac8c --- /dev/null +++ b/spec/tocgen_spec.py @@ -0,0 +1,159 @@ +import os +import fitz +import toml + +from mamba import description, it, before +from fitzutils import ToCEntry +from pdftocgen.tocgen import gen_toc + +dirpath = os.path.dirname(os.path.abspath(__file__)) + +with description("gen_toc") as self: + with before.all: + self.level2 = fitz.open(os.path.join(dirpath, "files/level2.pdf")) + self.level2_recipe = toml.load( + open(os.path.join(dirpath, "files/level2_recipe.toml")) + ) + self.level2_expect = [ + ToCEntry(level=1, title='1 Section One', + pagenum=1, vpos=237.6484375), + ToCEntry(level=1, title='2 Section Two', + pagenum=1, vpos=567.3842163085938), + ToCEntry(level=2, title='2.1 Subsection Two.One', + pagenum=2, vpos=452.56671142578125), + ToCEntry(level=1, + title='3 Section Three, with looong loooong looong ti- tle', + pagenum=3, vpos=335.569580078125), + ToCEntry(level=2, title='3.1 Subsection Three.One, ' + 'with even loooooooooooonger title, and probably even more', + pagenum=3, vpos=619.4886474609375), + ToCEntry(level=2, title='3.2 Subsection Three.Two', + pagenum=4, vpos=512.3426513671875), + ToCEntry(level=2, title='3.3 Subsection Three.Three', + pagenum=5, vpos=125.79861450195312), + ToCEntry(level=1, title='4 The End', + pagenum=5, vpos=366.62347412109375) + ] + + self.onepage = fitz.open(os.path.join(dirpath, "files/onepage.pdf")) + self.onepage_recipe = toml.load( + open(os.path.join(dirpath, "files/onepage_recipe.toml")) + ) + self.onepage_greedy = toml.load( + open(os.path.join(dirpath, "files/onepage_greedy.toml")) + ) + self.onepage_expect = [ + # false positive, but easy to remove in post-processing + ToCEntry(level=2, title='krasjet', + pagenum=1, vpos=196.53366088867188), + ToCEntry(level=1, title='1 Section One', + pagenum=1, vpos=237.6484375), + 
ToCEntry(level=1, title='2 Section Two', + pagenum=1, vpos=265.44744873046875), + ToCEntry(level=2, title='2.1 Subsection Two.One', + pagenum=1, vpos=291.0536804199219), + ToCEntry(level=2, title='2.2 Subsection Two.Two \xd7 2', + pagenum=1, vpos=311.1368103027344), + ToCEntry(level=1, title='3 Section Three, with looong loooong looong ti- tle', + pagenum=1, vpos=334.00946044921875), + ToCEntry(level=2, title='3.1 Subsection Three.One, ' + 'with even loooooooooooonger title, and probably even more', + pagenum=1, vpos=377.5487060546875), + ToCEntry(level=2, title='3.2 Subsection Three.Two', + pagenum=1, vpos=411.8786926269531), + ToCEntry(level=2, title='3.3 Subsection Three.Three', + pagenum=1, vpos=432.26068115234375), + ToCEntry(level=3, title='3.3.1 Subsubsection Three.Three.One', + pagenum=1, vpos=452.1441345214844), + ToCEntry(level=3, title='3.3.2 Subsubsection Three.Three.Two', + pagenum=1, vpos=470.53314208984375), + ToCEntry(level=3, title='3.3.3 Subsubsection Three.Three.Three', + pagenum=1, vpos=488.9231262207031), + ToCEntry(level=2, title='3.4 Subsection Three.Four', + pagenum=1, vpos=507.8106994628906), + ToCEntry(level=2, title='3.5 Subsection Three.Five', + pagenum=1, vpos=528.191650390625), + ToCEntry(level=1, title='4 The End', + pagenum=1, vpos=550.7654418945312) + ] + + self.onepage_greedy_expect = [ + # hooray, no more false positives + ToCEntry(level=1, title='1 Section One', + pagenum=1, vpos=237.6484375), + ToCEntry(level=1, title='2 Section Two', + pagenum=1, vpos=265.44744873046875), + ToCEntry(level=2, title='2.1 Subsection Two.One', + pagenum=1, vpos=291.0536804199219), + ToCEntry(level=2, title='2.2 Subsection Two.Two \xd7 2', + pagenum=1, vpos=311.1368103027344), + ToCEntry(level=1, title='3 Section Three, with looong loooong looong ti- tle', + pagenum=1, vpos=334.00946044921875), + ToCEntry(level=2, title='3.1 Subsection Three.One, ' + 'with even loooooooooooonger title, and probably even more', + pagenum=1, vpos=377.5487060546875), + 
ToCEntry(level=2, title='3.2 Subsection Three.Two', + pagenum=1, vpos=411.8786926269531), + ToCEntry(level=2, title='3.3 Subsection Three.Three', + pagenum=1, vpos=432.26068115234375), + ToCEntry(level=3, title='3.3.1 Subsubsection Three.Three.One', + pagenum=1, vpos=452.1441345214844), + ToCEntry(level=3, title='3.3.2 Subsubsection Three.Three.Two', + pagenum=1, vpos=470.53314208984375), + ToCEntry(level=3, title='3.3.3 Subsubsection Three.Three.Three', + pagenum=1, vpos=488.9231262207031), + ToCEntry(level=2, title='3.4 Subsection Three.Four', + pagenum=1, vpos=507.8106994628906), + ToCEntry(level=2, title='3.5 Subsection Three.Five', + pagenum=1, vpos=528.191650390625), + ToCEntry(level=1, title='4 The End', + pagenum=1, vpos=550.7654418945312) + ] + + self.hardmode = fitz.open(os.path.join(dirpath, "files/hardmode.pdf")) + self.hardmode_recipe = toml.load( + open(os.path.join(dirpath, "files/hardmode_recipe.toml")) + ) + + self.hardmode_expect = [ + ToCEntry(level=1, title='1 Section One', + pagenum=1, vpos=174.1232452392578), + ToCEntry(level=1, title='2 Section 1 + 1 = 2', + pagenum=1, vpos=584.5831909179688), + ToCEntry(level=2, title='2.1 Subsection Two.One', + pagenum=1, vpos=425.2061462402344), + ToCEntry(level=1, title='e ln(3)', + pagenum=2, vpos=516.01708984375), + ToCEntry(level=2, title='3.1 Subsection e ln(3) .1, ' + 'with looo- ooooooooong title', + pagenum=2, vpos=302.5021057128906), + ToCEntry(level=2, title='3.2 S ubsection Three.Two, another long title', + pagenum=3, vpos=396.212158203125), + ToCEntry(level=2, title='3.3 Subsection Three.Three', + pagenum=3, vpos=68.84815979003906), + ToCEntry(level=1, title='4 The x → ∞ End', + pagenum=3, vpos=483.49920654296875) + ] + + with it("generates 2-level toc correctly"): + assert gen_toc(self.level2, self.level2_recipe) == self.level2_expect + + with it("handles headings on same page correctly"): + assert gen_toc( + self.onepage, self.onepage_recipe + ) == self.onepage_expect + + with it("handles 
math in heading correctly"): + assert gen_toc( + self.onepage, self.onepage_recipe + ) == self.onepage_expect + + with it("handles greedy filter correctly"): + assert gen_toc( + self.onepage, self.onepage_greedy + ) == self.onepage_greedy_expect + + with it("passes the HARD MODE"): + assert gen_toc( + self.hardmode, self.hardmode_recipe + ) == self.hardmode_expect diff --git a/spec/tocio_spec.py b/spec/tocio_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..4b4fff163c0a3e8f49f3832ef1c2e588d57b5120 --- /dev/null +++ b/spec/tocio_spec.py @@ -0,0 +1,81 @@ +import os +import fitz + +from mamba import description, it, before +from fitzutils import ToCEntry +from pdftocio.tocio import read_toc, write_toc + +dirpath = os.path.dirname(os.path.abspath(__file__)) + +level2 = os.path.join(dirpath, "files/level2.pdf") +hastoc = os.path.join(dirpath, "files/hastoc.pdf") + +with description("read_toc") as self: + with before.all: + self.doc = fitz.open(level2) + self.reference = fitz.open(hastoc) + self.expect = [ + ToCEntry(level=1, title='Section One', pagenum=1, vpos=234.65998), + ToCEntry(level=1, title='Section Two', pagenum=1, vpos=562.148), + ToCEntry(level=2, title='Subsection Two.One', pagenum=2, vpos=449.522), + ToCEntry(level=1, + title='Section Three, with looong loooong looong title', + pagenum=3, + vpos=330.333), + ToCEntry(level=2, + title='Subsection Three.One, ' + 'with even loooooooooooonger title, and probably even more', + pagenum=3, + vpos=616.444), + ToCEntry(level=2, title='Subsection Three.Two', + pagenum=4, vpos=509.298), + ToCEntry(level=2, title='Subsection Three.Three', + pagenum=5, vpos=124.802), + ToCEntry(level=1, title='The End', pagenum=5, vpos=361.387) + ] + + with it("reads pdf toc correctly"): + assert self.expect == read_toc(self.reference) + + with it("makes (read_toc -> write_toc -> read_toc) an identity operation (except vpos)"): + toc = read_toc(self.reference) + write_toc(self.doc, toc) + toc2 = 
read_toc(self.doc) + + assert len(toc2) == len(toc) + for e1, e2 in zip(toc, toc2): + assert e1.level == e2.level + assert e1.title == e2.title + assert e1.pagenum == e2.pagenum + +with description("write_toc") as self: + with before.all: + self.doc = fitz.open(level2) + self.reference = fitz.open(hastoc) + self.toc = [ + ToCEntry(level=1, title='Section One', pagenum=1), + ToCEntry(level=1, title='Section Two', pagenum=1), + ToCEntry(level=2, title='Subsection Two.One', pagenum=2), + ToCEntry(level=1, + title='Section Three, with looong loooong looong title', + pagenum=3), + ToCEntry(level=2, + title='Subsection Three.One, ' + 'with even loooooooooooonger title, and probably even more', + pagenum=3), + ToCEntry(level=2, title='Subsection Three.Two', + pagenum=4), + ToCEntry(level=2, title='Subsection Three.Three', + pagenum=5), + ToCEntry(level=1, title='The End', pagenum=5) + ] + + with it("makes (write_toc -> read_toc) an identity operation (except vpos)"): + write_toc(self.doc, self.toc) + toc2 = read_toc(self.doc) + + assert len(toc2) == len(self.toc) + for e1, e2 in zip(self.toc, toc2): + assert e1.level == e2.level + assert e1.title == e2.title + assert e1.pagenum == e2.pagenum diff --git a/spec/xmeta_spec.py b/spec/xmeta_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..e1db81351c40a86ae8c5060dbf47177cd96d6f97 --- /dev/null +++ b/spec/xmeta_spec.py @@ -0,0 +1,188 @@ +import os +import fitz +import toml + +from mamba import description, it, before +from pdfxmeta import extract_meta, dump_meta, dump_toml + +dirpath = os.path.dirname(os.path.abspath(__file__)) + +with description("extract_meta:") as self: + with before.all: + self.doc = fitz.open(os.path.join(dirpath, "files/level2.pdf")) + + with it("extracts metadata from pdf"): + meta = extract_meta(self.doc, "Section One", 1) + assert len(meta) == 1 + + m = meta[0] + assert m['text'] == "Section One" + assert 'font' in m + assert 'CMBX12' in m['font'] + + with it("matches 
    with it("matches lowercase when ignore case is set"):
        meta = extract_meta(self.doc, "section one", 1, True)
        assert len(meta) == 1

        m = meta[0]
        assert m['text'] == "Section One"
        assert 'font' in m
        assert 'CMBX12' in m['font']

    with it("matches mixed case when ignore case is set"):
        meta = extract_meta(self.doc, "sEcTIoN OnE", 1, True)
        assert len(meta) == 1

        m = meta[0]
        assert m['text'] == "Section One"
        assert 'font' in m
        assert 'CMBX12' in m['font']

    with it("matches nothing if ignore case is not set"):
        meta = extract_meta(self.doc, "section one", 1, False)
        assert len(meta) == 0

    with it("can match multiple instances of needle"):
        meta = extract_meta(self.doc, "Section", 1)
        assert len(meta) == 2

        m = meta[0]
        assert m['text'] == "Section One"
        assert 'font' in m
        assert 'CMBX12' in m['font']

        m = meta[1]
        assert m['text'] == "Section Two"
        assert 'font' in m
        assert 'CMBX12' in m['font']

    with it("returns [] when nothing is matched"):
        # "Sectoin" is a deliberate misspelling.
        meta = extract_meta(self.doc, "Sectoin", 1, False)
        assert len(meta) == 0

    with it("returns [] when page number is out of range"):
        # Pages are 1-based here; 0 and 7 both fall outside the fixture.
        meta = extract_meta(self.doc, "Section One", 0)
        assert len(meta) == 0

        meta = extract_meta(self.doc, "Section One", 7)
        assert len(meta) == 0

    with it("can match text on any page when page number is not specified"):
        meta = extract_meta(self.doc, "The End")
        assert len(meta) == 1

        m = meta[0]
        assert m['text'] == "The End"
        assert 'font' in m
        assert 'CMBX12' in m['font']

with description("dump_meta:") as self:
    with before.all:
        self.doc = fitz.open(os.path.join(dirpath, "files/level2.pdf"))
        # Full span metadata expected for the "Section One" heading.
        self.expected_meta = {
            'font': {
                'name': 'CMBX12',
                'size': 14.346199989318848,
                'color': 0x000000,
                'superscript': False,
                'italic': False,
                'serif': True,
                'monospace': False,
                'bold': True
            },
            'bbox': {
                'left': 157.98439025878906,
                'top': 237.6484375,
                'right': 243.12905883789062,
                'bottom': 252.00897216796875
            }
        }

    with it("produces valid toml"):
        meta = extract_meta(self.doc, "Section One", 1)
        assert len(meta) == 1

        meta_dict = toml.loads(dump_meta(meta[0]))
        assert meta_dict == self.expected_meta


with description("dump_toml:") as self:
    with before.all:
        self.doc = fitz.open(os.path.join(dirpath, "files/level2.pdf"))
        # A heading-filter recipe keeps only the font name and size.
        self.expected_recipe = {
            'heading': [
                {
                    'level': 1,
                    'greedy': True,
                    'font': {
                        'name': 'CMBX12',
                        'size': 14.346199989318848,
                    }
                }
            ]
        }

    with it("produces valid toml"):
        meta = extract_meta(self.doc, "Section One", 1)
        assert len(meta) == 1

        meta_dict = toml.loads(dump_toml(meta[0], 1))
        assert meta_dict == self.expected_recipe

    with it("strips font subset correctly"):
        # Per the expectations below, only the first '+'-separated prefix
        # ("subset+") is stripped from the font name; later '+' stay.
        with_subset = {
            'font': "subset+font",
            'size': 1,
            'flags': 20,
            'color': 0,
            'bbox': (1, 2, 3, 4),
            'text': ""
        }

        without_subset = {
            'font': "font",
            'size': 1,
            'flags': 20,
            'color': 0,
            'bbox': (1, 2, 3, 4),
            'text': ""
        }

        expected = {
            'heading': [
                {
                    'level': 1,
                    'greedy': True,
                    'font': {
                        'name': 'font',
                        'size': 1
                    }
                }
            ]
        }

        double_plus = {
            'font': "subset+font+font",
            'size': 1,
            'flags': 20,
            'color': 0,
            'bbox': (1, 2, 3, 4),
            'text': ""
        }

        expected2 = {
            'heading': [
                {
                    'level': 1,
                    'greedy': True,
                    'font': {
                        'name': 'font+font',
                        'size': 1
                    }
                }
            ]
        }

        assert toml.loads(dump_toml(with_subset, 1)) == expected
        assert toml.loads(dump_toml(without_subset, 1)) == expected
        assert toml.loads(dump_toml(double_plus, 1)) == expected2
def main():
    """CLI: print every text span whose font name (and optionally size) matches.

    Usage: python find_by_font.py <pdf> <font_name> [font_size]
    """
    if len(sys.argv) < 3:
        # Fixed: the usage line had lost its argument placeholders.
        print("Usage: python find_by_font.py <pdf> <font_name> [font_size]")
        sys.exit(1)

    pdf_path = sys.argv[1]
    target_font = sys.argv[2]
    target_size = float(sys.argv[3]) if len(sys.argv) > 3 else None

    doc = fitz.open(pdf_path)

    print("Searching for:")
    print(f"  Font: '{target_font}'")
    # Fixed: compare against None explicitly, so an explicit size of 0
    # is not silently treated as "ANY".
    print(f"  Size: {target_size if target_size is not None else 'ANY'}")
    print("-" * 60)
    print(f"{'PAGE':<6} {'SIZE':<8} {'TEXT'}")
    print("-" * 60)

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:
            if "lines" not in b:
                continue  # non-text block (e.g. image)
            for l in b["lines"]:
                for s in l["spans"]:
                    # Partial, case-insensitive font-name match (friendlier
                    # than requiring the exact PostScript name).
                    if target_font.lower() in s["font"].lower():
                        # Size check with 1% relative tolerance, if requested.
                        if target_size is not None:
                            if not math.isclose(s["size"], target_size, rel_tol=1e-2):
                                continue
                        print(f"{page.number + 1:<6} {s['size']:<8.2f} '{s['text']}'")
def main():
    """CLI: show the exact characters (repr + hex) of spans containing a string.

    Useful for diagnosing invisible unicode (NBSP, soft hyphen) in ToC titles.
    """
    if len(sys.argv) < 3:
        # Fixed: the usage line had lost its argument placeholders.
        print("Usage: python inspect_bytes.py <pdf> \"<search_string>\"")
        sys.exit(1)

    pdf_path = sys.argv[1]
    search_str = sys.argv[2]

    doc = fitz.open(pdf_path)

    print(f"Searching for string containing: '{search_str}'")
    print("-" * 60)

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:
            if "lines" not in b:
                continue  # non-text block
            for l in b["lines"]:
                for s in l["spans"]:
                    text = s["text"]
                    if search_str in text:
                        # Visual form, escaped form, and per-char codepoints.
                        print(f"Page {page.number + 1}:")
                        print(f"  Visual: '{text}'")
                        print(f"  Raw: {ascii(text)}")
                        print(f"  Hex: { [hex(ord(c)) for c in text] }")
                        print("-" * 20)
def main():
    """CLI: scan a PDF and print the 25 largest text spans by font size."""
    if len(sys.argv) < 2:
        # Fixed: the usage line referred to the wrong script name
        # ("list_largest_fonts.py") and had lost its argument placeholder.
        print("Usage: python list_longest_fonts.py <pdf>")
        sys.exit(1)

    doc = fitz.open(sys.argv[1])

    # Collect candidate spans as (size, page, label, font, text) dicts and
    # sort at the end — simpler than maintaining a heap since the result
    # set shown is small.
    candidates = []

    print(f"Scanning {len(doc)} pages...")

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        label = page.get_label()
        for b in blocks:
            if "lines" in b:
                for l in b["lines"]:
                    for s in l["spans"]:
                        text = s["text"].strip()
                        if not text:
                            continue  # skip whitespace-only spans

                        candidates.append({
                            "size": s["size"],
                            "text": text[:50],  # truncate for display
                            "page": page.number + 1,
                            "label": label,
                            "font": s["font"]
                        })

    # Sort descending by size. No dedup on (size, font): seeing distinct
    # text instances of the same style is useful here.
    candidates.sort(key=lambda x: x["size"], reverse=True)

    # Fixed: the comment used to say "top 20" while the code shows 25.
    print("\n--- TOP 25 LARGEST TEXT SPANS ---")
    print(f"{'SIZE (pt)':<10} {'IDX':<6} {'LABEL':<8} {'FONT':<25} {'TEXT'}")
    print("-" * 75)

    for c in candidates[:25]:
        print(f"{c['size']:<10.2f} {c['page']:<6} {c['label']:<8} {c['font']:<25} '{c['text']}'")
+ + print(f"\n--- TOP 25 LARGEST TEXT SPANS ---") + print(f"{'SIZE (pt)':<10} {'IDX':<6} {'LABEL':<8} {'FONT':<25} {'TEXT'}") + print("-" * 75) + + for c in candidates[:25]: + print(f"{c['size']:<10.2f} {c['page']:<6} {c['label']:<8} {c['font']:<25} '{c['text']}'") + +if __name__ == "__main__": + main() diff --git a/utils/modify_toc.py b/utils/modify_toc.py new file mode 100644 index 0000000000000000000000000000000000000000..cbb3e221e65885c71db647eb702e5fd5baebbd13 --- /dev/null +++ b/utils/modify_toc.py @@ -0,0 +1,61 @@ +import sys +import re +import io + +def clean_text(text): + # Replace non-breaking spaces (\xa0) and soft hyphens (\xad) + # Also collapses multiple spaces + text = text.replace('\xa0', ' ').replace('\xad', '') + # Replace en-dash and em-dash with standard hyphen + text = text.replace('\u2013', '-').replace('\u2014', '-') + # Remove control characters (except allowed ones, though likely not needed for titles) + text = "".join(ch for ch in text if ch.isprintable()) + return ' '.join(text.split()) + +def main(): + # Force UTF-8 for stdin/stdout to handle special characters on Windows + # otherwise it defaults to cp1252/cp437 which mangles unicode + stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='replace') + stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace') + + # Regex to match ToC lines + # Captures: + # 1. Indentation (leading spaces) + # 2. Title (inside quotes) + # 3. Page Number + # 4. 
def main():
    """Read ToC lines on stdin, sanitize and renumber titles, write to stdout.

    Each line matching '  "Title" 123 ...' becomes '  "NNN_Title_pg123" 123 ...'
    (NNN is a running 3-digit index); non-matching lines pass through as-is.
    """
    # Force UTF-8 for stdin/stdout to handle special characters on Windows,
    # where the defaults (cp1252/cp437) would mangle unicode.
    stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='replace')
    stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')

    # Regex to match ToC lines. Captures:
    #   1. indentation (leading spaces)
    #   2. title (inside quotes)
    #   3. page number
    #   4. trailing content (like vpos)
    pattern = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')

    idx = 0

    for line in stdin:
        # Strip newline for processing
        line_content = line.rstrip('\n')
        if not line_content:
            stdout.write("\n")
            continue

        match = pattern.match(line_content)
        if match:
            indent = match.group(1)
            old_title = match.group(2)
            page_num = match.group(3)
            trailing = match.group(4)

            # Sanitize the title (fix weird spaces/hyphens)
            cleaned_title = clean_text(old_title)

            # Format: 000_Title_pgX
            new_title = f"{idx:03d}_{cleaned_title}_pg{page_num}"

            # Reconstruct the line
            new_line = f'{indent}"{new_title}" {page_num}{trailing}'

            stdout.write(new_line + "\n")
            idx += 1
        else:
            # If line doesn't match expected format, print as is
            stdout.write(line_content + "\n")

    # Fixed: explicitly flush the wrapper. Without this, output buffered in
    # the hand-built TextIOWrapper can be lost if it is finalized without a
    # flush at interpreter exit.
    stdout.flush()
def main():
    """Split a PDF into one file per entry of a pdftocgen-style ToC file."""
    if len(sys.argv) < 3:
        # Fixed: the usage line had lost its argument placeholders.
        print("Usage: python split_by_toc.py <pdf> <toc_file> [output_dir]")
        sys.exit(1)

    # Force UTF-8 for stdout/stderr (Windows consoles default to legacy codepages).
    sys.stdout.reconfigure(encoding='utf-8')
    sys.stderr.reconfigure(encoding='utf-8')

    pdf_path = sys.argv[1]
    toc_path = sys.argv[2]
    output_dir = sys.argv[3] if len(sys.argv) > 3 else "split_output"

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print(f"Splitting '{pdf_path}' based on '{toc_path}'...")

    # 1. Parse the ToC into a list of (title, start_page).
    entries = []
    # Matches modify_toc-style output ('"Title" Page ...') as well as
    # standard pdftocgen output.
    pattern = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')

    with open(toc_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            match = pattern.match(line)
            if match:
                title = match.group(2)
                page = int(match.group(3))
                entries.append((title, page))

    if not entries:
        print("Error: No ToC entries found.")
        sys.exit(1)

    # 2. Open the source PDF.
    doc = fitz.open(pdf_path)
    total_pages = doc.page_count

    print(f"Total Pages: {total_pages}")
    print(f"Found {len(entries)} chapters.")
    print("-" * 40)

    # 3. Iterate and split: each chapter runs to the page before the next one.
    for i, (title, start_page) in enumerate(entries):
        # ToC pages are 1-based; PyMuPDF indices are 0-based.
        start_idx = start_page - 1

        if i < len(entries) - 1:
            next_start_page = entries[i + 1][1]
            end_idx = next_start_page - 1 - 1  # one page before next chapter
        else:
            end_idx = total_pages - 1

        # Safety check for out-of-order bookmarks or same-page chapters:
        # fall back to grabbing the single start page.
        if start_idx > end_idx:
            end_idx = start_idx

        filename = f"{title}.pdf"
        # Sanitize filename (remove characters forbidden on common filesystems).
        filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
        out_path = os.path.join(output_dir, filename)

        print(f"[{i+1}/{len(entries)}] {title}")
        print(f"  Pages {start_page} to {end_idx + 1} (Count: {end_idx - start_idx + 1})")

        # Create a new PDF for this chapter.
        new_doc = fitz.open()
        new_doc.insert_pdf(doc, from_page=start_idx, to_page=end_idx)
        new_doc.save(out_path)
        new_doc.close()

    print("-" * 40)
    print(f"Done! Files saved to '{output_dir}/'")
def main():
    """Split a PDF into one file per Level-1 bookmark (top-level chapter)."""
    if len(sys.argv) < 2:
        # Fixed: the usage line had lost its argument placeholder.
        print("Usage: python split_pdf.py <pdf> [output_dir]")
        sys.exit(1)

    # Force UTF-8 for stdout/stderr (Windows consoles default to legacy codepages).
    sys.stdout.reconfigure(encoding='utf-8')
    sys.stderr.reconfigure(encoding='utf-8')

    pdf_path = sys.argv[1]
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "split_output"

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print(f"Reading bookmarks from '{pdf_path}'...")

    doc = fitz.open(pdf_path)
    toc = doc.get_toc()  # [[lvl, title, page_num, ...], ...]

    if not toc:
        print("Error: No bookmarks found in this PDF.")
        sys.exit(1)

    # Only top-level (Level 1) bookmarks delimit chapters.
    chapters = [entry for entry in toc if entry[0] == 1]

    print(f"Found {len(chapters)} top-level chapters.")

    total_pages = doc.page_count

    for i, (lvl, title, start_page, *_) in enumerate(chapters):
        # Chapter N ends where chapter N+1 begins; the last chapter runs
        # to the end of the document. 1-based pages -> 0-based indices.
        start_idx = start_page - 1

        if i < len(chapters) - 1:
            next_start_page = chapters[i + 1][2]
            end_idx = next_start_page - 1 - 1
        else:
            end_idx = total_pages - 1

        # Sanity check for same-page or out-of-order chapters.
        if end_idx < start_idx:
            end_idx = start_idx

        filename = f"{title}.pdf"
        # Sanitize: strip characters forbidden in filenames.
        filename = re.sub(r'[<>:"/\\|?*]', '_', filename).strip()
        out_path = os.path.join(output_dir, filename)

        # Fixed: the progress line printed the literal "(unknown)" instead
        # of the chapter title.
        print(f"Extracting: {title} (Pages {start_page}-{end_idx+1})")

        new_doc = fitz.open()
        new_doc.insert_pdf(doc, from_page=start_idx, to_page=end_idx)
        new_doc.save(out_path)
        new_doc.close()

    print(f"Done. Files saved to {output_dir}/")
class TestTOCProcessor(unittest.TestCase):
    """Unit tests for the ToC cleanup helpers in toc_processor."""

    def test_merge_same_page_headers(self):
        # Scenario: "American Government..." (Page 31) followed by
        # "Divided World" (Page 31) — one multi-line header split in two.
        input_toc = [
            [1, "Chapter 1 Intro", 5],
            [1, "American Government and Politics in a Racially", 31],
            [1, "Divided World", 31],
            [1, "Chapter 2", 57]
        ]

        expected_toc = [
            [1, "Chapter 1 Intro", 5],
            [1, "American Government and Politics in a Racially Divided World", 31],
            [1, "Chapter 2", 57]
        ]

        result = merge_same_page_headers(input_toc)

        print(f"\nInput: {[e[1] for e in input_toc]}")
        print(f"Result: {[e[1] for e in result]}")

        # Fixed: expected_toc was built but never asserted against —
        # compare the full structure in addition to the spot checks.
        self.assertEqual(result, expected_toc)
        self.assertEqual(len(result), 3)
        self.assertEqual(result[1][1], "American Government and Politics in a Racially Divided World")
        self.assertEqual(result[1][2], 31)

    def test_merge_same_page_headers_mixed_levels(self):
        # Scenario: Level 1 followed by Level 2 on same page (Should NOT merge)
        input_toc = [
            [1, "Chapter 1", 10],
            [2, "Section 1.1", 10]
        ]

        result = merge_same_page_headers(input_toc)
        self.assertEqual(len(result), 2)

    def test_clean_text(self):
        # NBSP becomes a space; the soft hyphen disappears.
        dirty = "Hello\xa0World\xad"
        clean = clean_text(dirty)
        self.assertEqual(clean, "Hello World")
# A ToC entry is a plain list shaped like [level, title, page, ...].
FitZTOCEntry = list

def clean_text(text: str) -> str:
    """
    Sanitize text to remove common PDF artifacts.
    Removes soft hyphens, fixes non-breaking spaces, and standardizes dashes.
    """
    if not text:
        return ""

    # Single-pass character substitutions via a translation table:
    # NBSP -> space, soft hyphen -> removed, en/em dash -> '-'.
    table = str.maketrans({
        '\xa0': ' ',
        '\xad': None,
        '\u2013': '-',
        '\u2014': '-',
    })
    text = text.translate(table)

    # Drop remaining non-printable characters, then collapse whitespace.
    printable = "".join(ch for ch in text if ch.isprintable())
    return ' '.join(printable.split())

def parse_raw_toc_output(raw_output: str) -> List[FitZTOCEntry]:
    """
    Parses the raw text output from `pdftocgen` or `pdftocio` into a structured list.
    Expected format lines: '    "Chapter Title" 123'
    """
    # Captures: 1=indent, 2=title, 3=page number, 4=trailing content.
    line_re = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')
    entries = []

    for raw_line in raw_output.splitlines():
        hit = line_re.match(raw_line)
        if hit is None:
            continue  # non-ToC line: skip
        leading, heading, page_digits, _tail = hit.groups()
        # Indentation encodes depth: 0 spaces = level 1, 4 spaces = level 2, ...
        entries.append([len(leading) // 4 + 1, heading, int(page_digits)])

    return entries
def merge_same_page_headers(toc: "List[FitZTOCEntry]") -> "List[FitZTOCEntry]":
    """
    Detects consecutive Level 1 headers derived from the same page and merges them.
    This fixes the "double split" issue where multi-line headers are detected
    as separate entries.

    Example:
        Input: [[1, "Title Part 1", 10], [1, "Title Part 2", 10]]
        Output: [[1, "Title Part 1 Title Part 2", 10]]
    """
    if not toc:
        return []

    merged_toc = []

    for entry in toc:
        level, title, page = entry[0], entry[1], entry[2]

        # We only care about merging Level 1 headers
        if level != 1:
            merged_toc.append(entry)
            continue

        # Check if we can merge with the previous entry
        if merged_toc:
            prev_entry = merged_toc[-1]
            prev_level, prev_title, prev_page = prev_entry[0], prev_entry[1], prev_entry[2]

            # CRITERIA: Both Level 1, Same Page
            if prev_level == 1 and prev_page == page:
                # Merge! Update the previous entry's title
                merged_toc[-1][1] = f"{prev_title} {title}"
                continue

        # Fixed: append a COPY of Level-1 entries. The original appended the
        # caller's list objects and then rewrote their titles in place on a
        # later merge, mutating the input ToC as a side effect.
        merged_toc.append(list(entry))

    return merged_toc

def process_toc(raw_toc_content: str) -> str:
    """
    Full pipeline to clean and format raw TOC content.
    Returns the string content formatted for `pdftocio` input.
    """
    # 1. Parse
    parsed_toc = parse_raw_toc_output(raw_toc_content)

    # 2. Clean Titles
    for entry in parsed_toc:
        entry[1] = clean_text(entry[1])

    # 3. Merge Same-Page Headers (The Double Split Fix)
    merged_toc = merge_same_page_headers(parsed_toc)

    # 4. Format for Output (re-serialize)
    # pdftocio expects: "Title" PageNum
    # DECOUPLED: We keep the PDF bookmarks clean (no number prefix).
    # File naming handling is moved to generate_chapter_splits.
    output_lines = []

    for entry in merged_toc:
        level, title, page = entry[0], entry[1], entry[2]

        # Indent: 4 spaces per level minus 1
        indent = " " * (4 * (level - 1))
        output_lines.append(f'{indent}"{title}" {page}')

    return "\n".join(output_lines)
def generate_chapter_splits(input_pdf_path: str, output_zip_path: str, back_matter_start_page: Optional[int] = None):
    """
    Splits the PDF based on Level 1 TOC entries and writes a ZIP file to the output path.

    Output naming inside the ZIP: 000_Front_matter.pdf (if any),
    001_<Title>_pg<N>.pdf ... for chapters, 999_Back_matter.pdf (if requested).

    Args:
        input_pdf_path: Path to source PDF
        output_zip_path: Path to write the ZIP
        back_matter_start_page: 1-based page number where Back Matter starts.
                                Chapters will be clamped to end before this page.
                                Content from this page to end will be saved as 999_Back_Matter.pdf.

    Raises:
        ValueError: if the PDF carries no table of contents.
    """
    doc = fitz.open(input_pdf_path)
    toc = doc.get_toc()  # entries shaped [level, title, page, ...]

    if not toc:
        doc.close()
        raise ValueError("No Table of Contents found in the PDF.")

    # Create the zip file
    with zipfile.ZipFile(output_zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        total_pages = doc.page_count

        # --- Front Matter Extraction ---
        # Find the first Level 1 chapter
        first_l1_page = None
        for entry in toc:
            if entry[0] == 1:
                first_l1_page = entry[2]
                break

        # If the first chapter starts after Page 1, extract Front Matter
        if first_l1_page and first_l1_page > 1:
            # Front matter is from page 0 to (first_l1_page - 1) - 1 (index)
            fm_end_idx = first_l1_page - 2

            if fm_end_idx >= 0:
                fm_doc = fitz.open()
                fm_doc.insert_pdf(doc, from_page=0, to_page=fm_end_idx)
                zf.writestr("000_Front_matter.pdf", fm_doc.tobytes())
                fm_doc.close()

        # --- Chapter Extraction ---
        chapter_idx = 1

        for i, entry in enumerate(toc):
            level, title, start_page = entry[0], entry[1], entry[2]

            # We skip non-L1 for splitting functionality
            if level != 1:
                continue

            # If this chapter starts AT or AFTER the back matter, skip it (it's inside back matter)
            if back_matter_start_page and start_page >= back_matter_start_page:
                continue

            start_idx = start_page - 1

            # Determine end page by lookahead: the next Level-1 entry bounds
            # this chapter; the last chapter runs to the end of the document.
            end_page = total_pages
            for next_entry in toc[i+1:]:
                if next_entry[0] == 1:
                    # The start of the next chapter is the end of this one
                    end_page = next_entry[2] - 1
                    break

            # --- CLAMPING: Check against Back Matter ---
            if back_matter_start_page:
                # If the *natural* end of this chapter goes into back matter, cut it short.
                # The cut point is back_matter_start_page - 1.
                # Example: Back Matter starts Pg 100. Chapter ends naturally Pg 105. Clamp to Pg 99.
                if end_page >= back_matter_start_page:
                    end_page = back_matter_start_page - 1

            end_idx = end_page - 1

            # Safety clamp for same-page or out-of-order chapters
            if end_idx < start_idx:
                end_idx = start_idx

            # Create sub-document
            new_doc = fitz.open()
            new_doc.insert_pdf(doc, from_page=start_idx, to_page=end_idx)

            # Sanitize filename: keep alphanumerics, spaces, '-' and '_'
            safe_title = "".join([c for c in title if c.isalnum() or c in (' ', '-', '_')]).strip()
            if not safe_title:
                safe_title = f"chapter_{chapter_idx}"

            # Formatting: 001_Title_pgX.pdf
            pdf_name = f"{chapter_idx:03d}_{safe_title}_pg{start_page}.pdf"
            chapter_idx += 1

            # Write to zip
            zf.writestr(pdf_name, new_doc.tobytes())
            new_doc.close()

        # --- Back Matter Generation ---
        if back_matter_start_page and back_matter_start_page <= total_pages:
            bm_start_idx = back_matter_start_page - 1
            bm_end_idx = total_pages - 1

            bm_doc = fitz.open()
            bm_doc.insert_pdf(doc, from_page=bm_start_idx, to_page=bm_end_idx)
            zf.writestr("999_Back_matter.pdf", bm_doc.tobytes())
            bm_doc.close()

    doc.close()