diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..2a7eb07b981a460bd4c198fb917a30587a7d01e2 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +spec/files/hardmode.pdf filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..a9213a96c5f0c9ce6fa8304e0528da6d5b5f7125 --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +dist +__pycache__ +*.egg-info +*.aux +*.dvi +*.fdb_latexmk +*.fls +*.log +*.out + +# User files +*.pdf +*.toc +recipe_debug.toml diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000000000000000000000000000000000000..f4f7dc13b324932a8b6d4424cb5daf0b1ede749f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,120 @@ +Change log +========== + +pdf.tocgen 1.3.4 +---------------- + +Released November 25, 2023 + +- Add error messages for `--page` and invalid file +- Fix KeyError when extracting ToC from some PDFs with pdftocio + +pdf.tocgen 1.3.3 +---------------- + +Released April 21, 2023 + +- Fix outdated dependencies +- Add vpos output for pdftocio +- Type stability enhancements + +pdf.tocgen 1.3.2 +---------------- + +Released April 20, 2023 + +- Fix outdated build system + +pdf.tocgen 1.3.1 +---------------- + +Released April 20, 2023 + +- Fix file encoding problems on Windows + +pdf.tocgen 1.3.0 +---------------- + +Released November 10, 2021 + +- Fix deprecation warning from PyMuPDF + +pdf.tocgen 1.2.3 +---------------- + +Released January 7, 2021 + +- Compatibility with PyMuPDF 1.18.6 + +pdf.tocgen 1.2.2 +---------------- + +Released October 11, 2020 + +- Compatibility with Python 3.9 + +pdf.tocgen 1.2.1 +---------------- + +Released August 7, 2020 + +- Fix a typo in the help message of `pdftocgen`. 
+ +pdf.tocgen 1.2.0 +---------------- + +Released August 7, 2020 + +- Swap out argparse in favor of getopt, which is much simpler and more + flexible. +- Now we could use `pdfxmeta doc.pdf` to dump an entire document, without the + empty pattern `""`. + +pdf.tocgen 1.1.3 +---------------- + +Released August 4, 2020 + +- Usefully complain when tocparser can't parse an entry + +pdf.tocgen 1.1.2 +---------------- + +Released August 3, 2020 + +- Add `--print` flag for `pdftocio` to force printing ToC. +- Add spec for cli commands. + +pdf.tocgen 1.1.1 +---------------- + +Released July 31, 2020 + +- Add a `--auto` option for `pdfxmeta` to output a valid heading filter directly. + +pdf.tocgen 1.1.0 +---------------- + +Released July 31, 2020 + +- Add a new option for a heading filter to be "greedy", which makes it extract + all the text in a block when at least one match occurs. This is extremely + useful for math-heavy documents. +- fixes the sorting problem with two column layout. + +pdf.tocgen 1.0.1 +---------------- + +Released July 29, 2020 + +- Update documentations +- Fix some linter warnings +- Fix unicode problem in tests +- Some prep work for the next major release + +pdf.tocgen 1.0.0 +---------------- + +Released July 28, 2020 + +- The first stable version diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..3877ae0a7ff6f94ac222fd704e112723db776114 --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. 
By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. 
+ + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. 
+ + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>. 
diff --git a/LICENSE_AGPL b/LICENSE_AGPL new file mode 100644 index 0000000000000000000000000000000000000000..1468d07c88d6a48dae9360ed0094955b54370224 --- /dev/null +++ b/LICENSE_AGPL @@ -0,0 +1,661 @@ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. 
However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. 
+ + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. 
+ + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. 
+ + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. 
There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..c2b5806d4fbd99f02a757abcfeb728c70dafebc6 --- /dev/null +++ b/Makefile @@ -0,0 +1,26 @@ +# As a workaround to [1], we will use a makefile instead +# [1]: https://github.com/python-poetry/poetry/issues/241 + +.PHONY: install test xmeta-demo lint + +test: # run tests + @poetry run mamba --format=documentation ./spec + @poetry run ./spec/cli_spec.sh + +lint: # run lint + @poetry run pylint ./spec ./pdfxmeta ./pdftocgen ./fitzutils ./pdftocio + +xmeta-demo: # a demo of pdfxmeta + @poetry run pdfxmeta ./spec/files/level2.pdf "Section" + +tocgen-demo: # a demo of tocgen + @poetry run pdftocgen ./spec/files/level2.pdf < ./recipes/default_latex.toml + +install: # set up non-dev dependencies + poetry install --no-dev + +dev: # set up dev dependencies + poetry install + +publish: test # publish package to pypi + poetry publish --build diff --git a/QUICK_START.md b/QUICK_START.md new file mode 100644 index 0000000000000000000000000000000000000000..ad0525b101289d221c0ba72cd2a6dd2612bf816a --- /dev/null +++ b/QUICK_START.md @@ -0,0 +1,62 @@ +# PDF ToC Generation Quick Start + +Optional: Run as App +```bash +streamlit run app.py +``` +This will open a local web page where you can upload a PDF, analyze fonts, and generate bookmarks with one click. + +### Find Header Candidates +If you don't know the font size/name of your chapters, this lists the top 25 largest text elements. 
+```bash +python utils/list_longest_fonts.py +``` +*Output: Font Name, Size, Physical Page, Logical Page Label.* + +### Find Header by Context +If you know a specific string (e.g., the first sentence of a chapter) but can't find the header itself, this finds the element *immediately preceding* that string. +```bash +python utils/find_preceding.py "known text string" +``` + +### Debug Text Artifacts +If your bookmarks have weird characters (e.g., `??`), use this to see the raw byte codes (looking for soft hyphens `\xad`, non-breaking spaces `\xa0`, etc.). +```bash +python utils/inspect_bytes.py "Problematic String" +``` + +--- + +## Recipe Generation (pdfxmeta) +Once you have identified the visual style of your headers (e.g., "Caslon 54pt"), you can inspect specific text or automatically generate recipe entries using `pdfxmeta`. + +### Inspect Font Details +To get the exact font name and size of a specific string (e.g., "Chapter 1"): +```bash +pdfxmeta input.pdf "Chapter 1" +``` +*Output will show `font.name`, `font.size`, etc.* + +### Auto-Generate Recipe Entry +To append a valid TOML filter directly to your recipe file (level 1 header): +```bash +pdfxmeta -a 1 input.pdf "Chapter 1" >> recipe.toml +``` + +--- + +## The Pipeline +Run the full extraction and generation pipeline. + +### Middleware: `modify_toc.py` +We use a custom Python script to: +1. **Sanitize Text**: Removes soft hyphens (`\xad`) and cleans encodings. +2. **Format Labels**: Renames bookmarks to `001_Title_pgX`. +3. **Fix Encoding**: Forces UTF-8 handling to prevent pipe corruption. + +### The Command +**Git Bash** is recommended to avoid PowerShell encoding issues. 
+ +```bash +pdftocgen -r recipe.toml input.pdf | python utils/modify_toc.py | pdftocio -o output.pdf input.pdf +``` diff --git a/README b/README new file mode 100644 index 0000000000000000000000000000000000000000..d33278e0aa2685d91d56f4d45399e21d5d2afede --- /dev/null +++ b/README @@ -0,0 +1,214 @@ +pdf.tocgen +========== + + in.pdf + | + | + +----------------------+--------------------+ + | | | + V V V ++----------+ +-----------+ +----------+ +| | recipe | | ToC | | +| pdfxmeta +--------->| pdftocgen +-------->| pdftocio +---> out.pdf +| | | | | | ++----------+ +-----------+ +----------+ + +pdf.tocgen is a set of command-line tools for automatically +extracting and generating the table of contents (ToC) of a +PDF file. It uses the embedded font attributes and position +of headings to deduce the basic outline of a PDF file. + +It works best for PDF files produces from a TeX document +using pdftex (and its friends pdflatex, pdfxetex, etc.), but +it's designed to work with any *software-generated* PDF +files (i.e. you shouldn't expect it to work with scanned +PDFs). Some examples include troff/groff, Adobe InDesign, +Microsoft Word, and probably more. + +Please see the homepage [1] for a detailed introduction. + +Installation +------------ + +pdf.tocgen is written in Python 3. It is known to work with +Python 3.7 to 3.11 on Linux, Windows, and macOS (On BSDs, +you probably need to build PyMuPDF yourself). Use + + $ pip install -U pdf.tocgen + +to install the latest version systemwide. Alternatively, use +`pipx` or + + $ pip install -U --user pdf.tocgen + +to install it for the current user. I would recommend the +latter approach to avoid messing up the package manager on +your system. + +If you are using an Arch-based Linux distro, the package is +also available on AUR [8]. It can be installed using any AUR +helper, for example yay: + + $ yay -S pdf.tocgen + +Workflow +-------- + +The design of pdf.tocgen is influenced by the Unix philosophy [2]. 
+
+I intentionally separated pdf.tocgen into 3 separate programs.
+They work together, but each of them is useful on their own.
+
+1. pdfxmeta: extract the metadata (font attributes, positions)
+   of headings to build a *recipe* file.
+2. pdftocgen: generate a table of contents from the recipe.
+3. pdftocio: import the table of contents to the PDF document.
+
+You should read the example [3] on the homepage for a proper
+introduction, but the basic workflow is as follows.
+
+First, use pdfxmeta to search for the metadata of headings,
+and generate *heading filters* using the automatic setting
+
+    $ pdfxmeta -p page -a 1 in.pdf "Section" >> recipe.toml
+    $ pdfxmeta -p page -a 2 in.pdf "Subsection" >> recipe.toml
+
+Note that `page` needs to be replaced by the page number of
+the search keyword.
+
+The output `recipe.toml` file would contain several heading
+filters, each of which specifies the attributes a heading
+at a particular level should have.
+
+An example recipe file would look like this:
+
+    [[heading]]
+    level = 1
+    greedy = true
+    font.name = "Times-Bold"
+    font.size = 19.92530059814453
+
+    [[heading]]
+    level = 2
+    greedy = true
+    font.name = "Times-Bold"
+    font.size = 11.9552001953125
+
+Then pass the recipe to `pdftocgen` to generate a table of
+contents,
+
+    $ pdftocgen in.pdf < recipe.toml
+    "Preface" 5
+    "Bottom-up Design" 5
+    "Plan of the Book" 7
+    "Examples" 9
+    "Acknowledgements" 9
+    "Contents" 11
+    "The Extensible Language" 14
+    "1.1 Design by Evolution" 14
+    "1.2 Programming Bottom-Up" 16
+    "1.3 Extensible Software" 18
+    "1.4 Extending Lisp" 19
+    "1.5 Why Lisp (or When)" 21
+    "Functions" 22
+    "2.1 Functions as Data" 22
+    "2.2 Defining Functions" 23
+    "2.3 Functional Arguments" 26
+    "2.4 Functions as Properties" 28
+    "2.5 Scope" 29
+    "2.6 Closures" 30
+    "2.7 Local Functions" 34
+    "2.8 Tail-Recursion" 35
+    "2.9 Compilation" 37
+    "2.10 Functions from Lists" 40
+    "Functional Programming" 41
+    "3.1 Functional Design" 41
+    "3.2 Imperative Outside-In" 46
+    "3.3 Functional Interfaces" 48
+    "3.4 Interactive Programming" 50
+    [--snip--]
+
+which can be directly imported to the PDF file using
+`pdftocio`,
+
+    $ pdftocgen in.pdf < recipe.toml | pdftocio -o out.pdf in.pdf
+
+Or if you want to edit the table of contents before
+importing it,
+
+    $ pdftocgen in.pdf < recipe.toml > toc
+    $ vim toc # edit
+    $ pdftocio in.pdf < toc
+
+Each of the three programs has some extra functionalities.
+Use the -h option to see all the options you could pass in.
+
+Development
+-----------
+
+If you want to modify the source code or contribute anything,
+first install poetry [4], which is a dependency and package
+manager for Python used by pdf.tocgen. Then run
+
+    $ poetry install
+
+in the root directory of this repository to set up
+development dependencies.
+
+If you want to test the development version of pdf.tocgen,
+use the `poetry run` command:
+
+    $ poetry run pdfxmeta in.pdf "pattern"
+
+Alternatively, you could also use the
+
+    $ poetry shell
+
+command to open up a virtual environment and run the
+development version directly:
+
+    (pdf.tocgen) $ pdfxmeta in.pdf "pattern"
+
+Before you send a patch or pull request, make sure the unit
+test passes by running:
+
+    $ make test
+
+GUI front end
+-------------
+
+If you are an Emacs user, you could install Daniel Nicolai's
+toc-mode [9] package as a GUI front end for pdf.tocgen,
+though it offers many more functionalities, such as
+extracting (printed) table of contents from a PDF file. Note
+that it uses pdf.tocgen under the hood, so you still need to
+install pdf.tocgen before using toc-mode as a front end for
+pdf.tocgen.
+
+License
+-------
+
+pdf.tocgen itself is free software. The source code of
+pdf.tocgen is licensed under the GNU GPLv3 license. However,
+the recipes in the `recipes` directory are separately
+licensed under the CC BY-NC-SA 4.0 License [7] to prevent
+any commercial usage, and thus not included in the
+distribution.
+ +pdf.tocgen is based on PyMuPDF [5], licensed under the GNU +GPLv3 license, which is again based on MuPDF [6], licensed +under the GNU AGPLv3 license. A copy of the AGPLv3 license +is included in the repository. + +If you want to make any derivatives based on this project, +please follow the terms of the GNU GPLv3 license. + + +[1]: https://krasjet.com/voice/pdf.tocgen/ +[2]: https://en.wikipedia.org/wiki/Unix_philosophy +[3]: https://krasjet.com/voice/pdf.tocgen/#a-worked-example +[4]: https://python-poetry.org/ +[5]: https://github.com/pymupdf/PyMuPDF +[6]: https://mupdf.com/docs/index.html +[7]: https://creativecommons.org/licenses/by-nc-sa/4.0/ +[8]: https://aur.archlinux.org/packages/pdf.tocgen/ +[9]: https://github.com/dalanicolai/toc-mode diff --git a/README.md b/README.md index 8311edac1387771581380451f604af19646ca8c1..19bb3b63b5704e3db46c547078f6291b4fac69d5 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,30 @@ ---- -title: Pdf.tocgen.split -emoji: 🚀 -colorFrom: red -colorTo: red -sdk: docker -app_port: 8501 -tags: -- streamlit -pinned: false -short_description: Split PDF by headings based on Krasjet pdf.tocgen -license: gpl-2.0 ---- - -# Welcome to Streamlit! - -Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart: - -If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community -forums](https://discuss.streamlit.io). +--- +title: PDF TOC Generator Split +emoji: 📑 +colorFrom: blue +colorTo: indigo +sdk: streamlit +sdk_version: 1.41.1 +app_file: app.py +pinned: false +license: agpl-3.0 +short_description: Generate PDF Table of Contents and Split Chapters +--- + +# PDF Table of Contents Generator (Split Edition) + +Based on [pdf.tocgen](https://github.com/Krasjet/pdf.tocgen). + +## Features +- **Analyze Fonts**: Automatically detect chapter headers by font size and style. +- **Search**: Find headers by text search (Case Sensitive option available). 
+- **Generate TOC**: Create a clickable PDF bookmark outline. +- **Split Chapters**: Export each chapter as a separate PDF in a ZIP file. +- **Front/Back Matter**: Automatically handle un-numbered front matter and user-defined back matter (Index, Glossary). + +## Usage +1. Upload a PDF. +2. Use "Scan & Generate" to find headers. +3. Configure the "Back Matter" start page if needed. +4. Run Pipeline. +5. Download the Bookmarked PDF or the Zipped Chapter Splits. diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..0f9a98ba5df1d5ba05d922691738ff2e9776227c --- /dev/null +++ b/app.py @@ -0,0 +1,381 @@ +import streamlit as st +import pandas as pd +import fitz # PyMuPDF +import os +import subprocess +import tempfile +import sys +import toml +import shutil +import zipfile +import io + +# Ensure we can import from utils if needed +sys.path.append(os.path.dirname(__file__)) +from utils import toc_processor +from pdfxmeta import pdfxmeta + +st.set_page_config(page_title="PDF Bookmark Generator", layout="wide") + +st.title("PDF Table of Contents Generator") + +st.markdown(""" +**Upload a PDF**, analyze its fonts to find headers, and generate a clean Table of Contents. +""") + +uploaded_file = st.file_uploader("Choose a PDF file", type="pdf") + +if uploaded_file is not None: + # We need to save the uploaded file to disk for the CLI tools to read it + # We'll use a permanent temp file for the session so we don't have to re-upload constantly + # But for cleanliness, we might want to put this in a temp dir too? 
+ # For now, keeping the input file logic as is (tempfile), but we'll put OUTPUTS in a pure temp dir + + with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf: + tmp_pdf.write(uploaded_file.getvalue()) + input_pdf_path = tmp_pdf.name + + # --- State Management & Reset --- + # Check if a new file is uploaded + file_id = f"{uploaded_file.name}_{uploaded_file.size}" # Robust proxy for ID + if 'current_file_id' not in st.session_state: + st.session_state['current_file_id'] = None + + if st.session_state['current_file_id'] != file_id: + # NEW FILE DETECTED: Reset Pipeline State + keys_to_reset = ['final_pdf_bytes', 'final_zip_bytes', 'final_zip_name', 'search_matches', 'font_name', 'font_size'] + for k in keys_to_reset: + if k in st.session_state: + del st.session_state[k] + st.session_state['current_file_id'] = file_id + # st.toast(f"New file loaded: {uploaded_file.name}. State cleared.") + + st.success(f"Loaded: {uploaded_file.name}") + + # --- Data Source Selection --- + st.header("1. Source Selection") + source_mode = st.radio("Where should the bookmarks come from?", + ["Scan & Generate (Create New)", "Use Existing Bookmarks (Modify)"], + help="Choose 'Scan & Generate' to build new bookmarks from fonts. Choose 'Use Existing' to tidy up bookmarks already in the file.") + + # --- Analysis Section (Only for Generate) --- + if source_mode == "Scan & Generate (Create New)": + st.header("2. 
Analyze Fonts") + + if 'font_name' not in st.session_state: + st.session_state['font_name'] = '' + if 'font_size' not in st.session_state: + st.session_state['font_size'] = 18.0 + + tab1, tab2 = st.tabs(["Scan for Large Fonts", "Search by Text"]) + + with tab1: + if st.button("Find Header Candidates"): + with st.spinner("Scanning PDF for large fonts..."): + doc = fitz.open(input_pdf_path) + candidates = [] + for page in doc[:50]: + text_page = page.get_text("dict") + for block in text_page["blocks"]: + for line in block.get("lines", []): + for span in line["spans"]: + text = span["text"].strip() + if len(text) > 3: + candidates.append({ + "Text": text[:50], + "Font": span["font"], + "Size": round(span["size"], 2), + "Page": page.number + 1 + }) + doc.close() + if candidates: + df = pd.DataFrame(candidates) + summary = df.groupby(['Font', 'Size']).size().reset_index(name='Count') + summary = summary.sort_values(by=['Size', 'Count'], ascending=[False, False]).head(20) + st.session_state['scan_results'] = summary + else: + st.warning("No text found.") + + if 'scan_results' in st.session_state: + st.write("### Top Large Fonts Found") + st.dataframe(st.session_state['scan_results'], use_container_width=True) + + def update_from_scan(): + val = st.session_state.scan_selector + if val: + f_name = val.split(" (")[0] + f_size = float(val.split("(")[1].replace("pt)", "")) + st.session_state['font_name'] = f_name + st.session_state['font_size'] = f_size + + options = st.session_state['scan_results'].apply(lambda x: f"{x['Font']} ({x['Size']}pt)", axis=1) + st.selectbox("Select extraction font:", options, key='scan_selector', on_change=update_from_scan, index=None, placeholder="Choose a font...") + + with tab2: + search_query = st.text_input("Enter text to find (e.g., 'Chapter 1')", "") + + c1, c2 = st.columns([1, 3]) + with c1: + do_search = st.button("Search Text") + with c2: + is_case_sensitive = st.checkbox("Case Sensitive", value=False) + + if do_search: + with 
st.spinner(f"Searching for '{search_query}'..."): + # Use the robust pdfxmeta library + try: + doc = fitz.open(input_pdf_path) + # pdfxmeta expects a regex pattern, so we escape the query to be safe + import re + safe_pattern = re.escape(search_query) + + # extract_meta returns a list of dicts (spans) + results = pdfxmeta.extract_meta(doc, safe_pattern, ign_case=(not is_case_sensitive)) + doc.close() + + matches = [] + for res in results: + matches.append({ + "Text": res.get("text", "").strip(), + "Font": res.get("font", ""), + "Size": round(res.get("size", 0), 2), + "Page": res.get("page_index", 0) + }) + # Limit for display safety + if len(matches) > 50: break + + if matches: + st.session_state['search_matches'] = pd.DataFrame(matches) + else: + st.warning("No matches found.") + + except Exception as e: + st.error(f"Search failed: {e}") + + if 'search_matches' in st.session_state: + st.write(f"### Found Matches") + st.dataframe(st.session_state['search_matches'], use_container_width=True) + + def update_from_search(): + val = st.session_state.search_selector + if val: + parts = val.split(" (") + f_name = parts[0] + f_size = float(parts[1].split("pt)")[0]) + st.session_state['font_name'] = f_name + st.session_state['font_size'] = f_size + + options = st.session_state['search_matches'].apply(lambda x: f"{x['Font']} ({x['Size']}pt) - Pg {x['Page']}", axis=1) + st.selectbox("Select font from match:", options, key='search_selector', on_change=update_from_search, index=None, placeholder="Choose a match...") + + # --- Configuration (Only for Generate) --- + st.header("3. 
Configure Recipe") + col1, col2 = st.columns(2) + with col1: + font_name_input = st.text_input("Font Name", key='font_name') + with col2: + font_size_input = st.number_input("Font Size", key='font_size') + + greedy = st.checkbox("Greedy Match (Merge multiline specs)", value=True) + + # --- Back Matter Configuration --- + with st.expander("Back Matter Configuration (Optional)", expanded=False): + st.markdown("Identify where the **Back Matter** (Index, Glossary, etc.) starts to split it into a separate `999_Back_matter.pdf`.") + + # Independent Search for Back Matter + bm_query = st.text_input("Find Back Matter start (e.g., 'Index')", key="bm_search_query") + + c_bm1, c_bm2 = st.columns([1, 3]) + with c_bm1: + do_bm_search = st.button("Search Back Matter") + with c_bm2: + bm_case_sensitive = st.checkbox("Case Sensitive", key="bm_sens", value=False) + + if do_bm_search: + with st.spinner("Searching..."): + try: + doc = fitz.open(input_pdf_path) + import re + safe_pattern = re.escape(bm_query) + results = pdfxmeta.extract_meta(doc, safe_pattern, ign_case=(not bm_case_sensitive)) + doc.close() + + bm_matches = [] + for res in results: + bm_matches.append({ + "Text": res.get("text", "").strip(), + "Page": res.get("page_index", 0) # Display raw (already 1-based from pdfxmeta) + }) + if len(bm_matches) > 50: break + + if bm_matches: + st.session_state['bm_matches'] = pd.DataFrame(bm_matches) + else: + st.warning("No matches found.") + except Exception as e: + st.error(f"Search failed: {e}") + + if 'bm_matches' in st.session_state: + st.dataframe(st.session_state['bm_matches'], use_container_width=True) + + def update_bm_page(): + val = st.session_state.bm_selector + if val: + # Value format: "Page X - Text..." 
+ page_num = int(val.split(" -")[0].replace("Page ", "")) + st.session_state['back_matter_page'] = page_num + + bm_options = st.session_state['bm_matches'].apply(lambda x: f"Page {x['Page']} - {x['Text'][:30]}...", axis=1) + st.selectbox("Select Start Page:", bm_options, key='bm_selector', on_change=update_bm_page, index=None, placeholder="Select start page...") + + # Manual Override + # Update session state when this input changes + def update_manual_bm(): + st.session_state['back_matter_page'] = st.session_state.back_matter_page_manual + + st.number_input("Or manually set Start Page:", min_value=0, value=st.session_state.get('back_matter_page', 0), key='back_matter_page_manual', on_change=update_manual_bm) + + else: + # Existing Mode + st.info("Using existing bookmarks. They will be cleaned, numbered, and used for splitting/downloading.") + + # --- Generation --- + st.header("4. Process & Generate") + + if st.button("Run Pipeline"): + # Validate inputs if generating + if source_mode == "Scan & Generate (Create New)" and not st.session_state.get('font_name'): + st.error("Please specify a font name for extraction.") + else: + with st.status("Running pipeline tasks...", expanded=True) as status: + # Use a temporary directory for all intermediate files + with tempfile.TemporaryDirectory() as temp_dir: + status.write(f"Created temp workspace: {temp_dir}") + + # Paths + recipe_path = os.path.join(temp_dir, "recipe.toml") + raw_toc_path = os.path.join(temp_dir, "raw.toc") # pdftocgen output + clean_toc_path = os.path.join(temp_dir, "clean.toc") # modify_toc output + output_pdf_path = os.path.join(temp_dir, "final.pdf") + + raw_toc_content = "" + + if source_mode == "Scan & Generate (Create New)": + # 1. 
Create Recipe + recipe_data = { + "heading": [{ + "level": 1, + "greedy": greedy, + "font": { + "name": st.session_state['font_name'], + "size": st.session_state['font_size'], + "size_tolerance": 0.1 + } + }] + } + with open(recipe_path, "w") as f: + toml.dump(recipe_data, f) + status.write("✅ Recipe created") + + # 2. Run pdftocgen -> raw.toc + status.write("Running pdftocgen (Scanning)...") + cmd1 = f'pdftocgen -r "{recipe_path}" "{input_pdf_path}"' + process = subprocess.run(cmd1, shell=True, capture_output=True, text=True, encoding='utf-8') + if process.returncode != 0: + st.error(f"pdftocgen failed: {process.stderr}") + st.stop() + raw_toc_content = process.stdout + status.write("✅ Headers extracted") + + else: + # Existing Bookmarks + status.write("Extracting existing bookmarks...") + # Run pdftocio in extract mode + cmd1 = f'pdftocio "{input_pdf_path}"' + process = subprocess.run(cmd1, shell=True, capture_output=True, text=True, encoding='utf-8') + if process.returncode != 0: + st.error(f"pdftocio failed: {process.stderr}") + st.stop() + raw_toc_content = process.stdout + if not raw_toc_content.strip(): + st.warning("No existing bookmarks found!") + st.stop() + status.write("✅ Existing bookmarks imported") + + # 3. Clean Content (Using centralized utility) + status.write("Cleaning and merging bookmarks...") + cleaned_toc_content = toc_processor.process_toc(raw_toc_content) + + with open(clean_toc_path, "w", encoding='utf-8') as f: + f.write(cleaned_toc_content) + status.write("✅ Bookmarks formatted (Double-splits fixed)") + + # 4. Write PDF + status.write("Writing to PDF...") + cmd3 = f'pdftocio -t "{clean_toc_path}" -o "{output_pdf_path}" "{input_pdf_path}"' + process = subprocess.run(cmd3, shell=True, capture_output=True, text=True) + if process.returncode != 0: + st.error(f"pdftocio failed: {process.stderr}") + st.stop() + status.write("✅ PDF saved") + + # 5. 
Read Result for Download + with open(output_pdf_path, "rb") as f: + st.session_state['final_pdf_bytes'] = f.read() + + # 6. Split & Zip (The Feature) + # Use a temp file for the zip to avoid memory issues + with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as tmp_zip: + tmp_zip_path = tmp_zip.name + + try: + # Pass back_matter_page if it exists and is valid + bm_page = st.session_state.get('back_matter_page', 0) + if bm_page == 0: bm_page = None + + toc_processor.generate_chapter_splits(output_pdf_path, tmp_zip_path, back_matter_start_page=bm_page) + + with open(tmp_zip_path, "rb") as f: + st.session_state['final_zip_bytes'] = f.read() + + base_name = os.path.splitext(uploaded_file.name)[0] + st.session_state['final_zip_name'] = f"{base_name}_chapters.zip" + + except Exception as e: + st.error(f"Error generating zip: {e}") + finally: + if os.path.exists(tmp_zip_path): + os.unlink(tmp_zip_path) + + # --- Persistent Download Area --- + if 'final_pdf_bytes' in st.session_state: + st.success("Pipeline completed successfully!") + st.write("### Downloads") + + c_dl1, c_dl2 = st.columns(2) + with c_dl1: + st.download_button( + label="Download Bookmarked PDF", + data=st.session_state['final_pdf_bytes'], + file_name="bookmarked_doc.pdf", + mime="application/pdf", + key="dl_pdf_btn" + ) + + with c_dl2: + if 'final_zip_bytes' in st.session_state: + st.download_button( + label=f"Download ZIP ({st.session_state['final_zip_name']})", + data=st.session_state['final_zip_bytes'], + file_name=st.session_state['final_zip_name'], + mime="application/zip", + key="dl_zip_btn" + ) + + st.markdown("---") + st.markdown(""" +
+ Based on pdf.tocgen by krasjet.
+ Enhanced with UI, Chapter Splitting, and Metadata Search. Licensed under AGPL-3.0. +
+ """, unsafe_allow_html=True) diff --git a/fitzutils/__init__.py b/fitzutils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5af8abf9c7e0f10603381e66862c2618a1aa0ffd --- /dev/null +++ b/fitzutils/__init__.py @@ -0,0 +1,17 @@ +"""A collection of utility functions to work with PyMuPDF""" + +from .fitzutils import ( + open_pdf, + ToCEntry, + dump_toc, + pprint_toc, + get_file_encoding +) + +__all__ = [ + 'open_pdf', + 'ToCEntry', + 'dump_toc', + 'pprint_toc', + 'get_file_encoding' +] diff --git a/fitzutils/fitzutils.py b/fitzutils/fitzutils.py new file mode 100644 index 0000000000000000000000000000000000000000..1f1167b3c2e9308e32fc2ad43d66bca3cbfc8ec7 --- /dev/null +++ b/fitzutils/fitzutils.py @@ -0,0 +1,112 @@ +from contextlib import contextmanager +from dataclasses import dataclass +from typing import Optional, ContextManager, List, Tuple +from fitz import Document + +import sys +import fitz +import io +import csv +import chardet + + +@contextmanager +def open_pdf(path: str, + exit_on_error: bool = True + ) -> ContextManager[Optional[Document]]: + """A context manager for fitz Document + + This context manager will take care of the error handling when creating a + fitz Document. 
@dataclass
class ToCEntry:
    """One row of a table of contents."""
    level: int      # heading depth, 1-based
    title: str
    pagenum: int
    # vertical offset of the heading on its page (bbox top); acts as a
    # secondary sort key so entries on the same page keep document order
    vpos: Optional[float] = None

    @staticmethod
    def key(e) -> Tuple[int, float]:
        """Sort key: order by page number first, then vertical position."""
        vertical = e.vpos if e.vpos is not None else 0
        return e.pagenum, vertical

    def to_fitz_entry(self) -> list:
        """Convert to the [level, title, page(, vpos)] list PyMuPDF expects."""
        entry = [self.level, self.title, self.pagenum]
        if self.vpos is not None:
            entry.append(self.vpos)
        return entry
def get_file_encoding(path: str) -> str:
    """Detect the text encoding of a file.

    Runs chardet over the raw bytes. Falls back to utf-8 when the file
    cannot be read or when detection is inconclusive — chardet returns
    None for e.g. empty files, and this function's contract is to always
    return a usable encoding name.

    Argument
        path: file path
    Returns
        encoding string (never None)
    """
    try:
        with open(path, "rb") as f:
            enc = chardet.detect(f.read()).encoding
    except OSError:
        # narrowed from a bare `except:` which also swallowed
        # SystemExit/KeyboardInterrupt; unreadable file -> fall back
        enc = None
    return enc or 'utf-8'
def admits_float(expect: Optional[float],
                 actual: Optional[float],
                 tolerance: float) -> bool:
    """Check if a float should be admitted by a filter.

    An unset expectation (None) admits any value; otherwise the actual
    value must be present and lie within `tolerance` of the expectation.
    """
    if expect is None:
        return True
    if actual is None:
        return False
    return abs(expect - actual) <= tolerance
class BoundingBoxFilter:
    """Filter on the bounding box of a span.

    Every side is optional: an unset side admits any position. Values
    are compared through admits_float using `tolerance` (defaulting to
    DEF_TOLERANCE).
    """
    left: Optional[float]
    top: Optional[float]
    right: Optional[float]
    bottom: Optional[float]
    # fixed: annotation previously read "tolernace", which never matched
    # the "tolerance" attribute assigned in __init__
    tolerance: float

    def __init__(self, bbox_dict: dict):
        self.left = bbox_dict.get('left')
        self.top = bbox_dict.get('top')
        self.right = bbox_dict.get('right')
        self.bottom = bbox_dict.get('bottom')
        self.tolerance = bbox_dict.get('tolerance', DEF_TOLERANCE)

    def admits(self, spn: dict) -> bool:
        """Check if the bounding box admits the span

        Argument
            spn: the span dict to be checked
        Returns
            False if the span doesn't match current bounding box setting
        """
        bbox = spn.get('bbox', (None, None, None, None))
        return (admits_float(self.left, bbox[0], self.tolerance) and
                admits_float(self.top, bbox[1], self.tolerance) and
                admits_float(self.right, bbox[2], self.tolerance) and
                admits_float(self.bottom, bbox[3], self.tolerance))
raise ValueError("filter's 'level' is not set") + if lvl < 1: + raise ValueError("filter's 'level' must be >= 1") + + self.level = lvl + self.greedy = fltr_dict.get('greedy', False) + self.font = FontFilter(fltr_dict.get('font', {})) + self.bbox = BoundingBoxFilter(fltr_dict.get('bbox', {})) + + def admits(self, spn: dict) -> bool: + """Check if the filter admits the span + + Arguments + spn: the span dict to be checked + Returns + False if the span doesn't match the filter + """ + return self.font.admits(spn) and self.bbox.admits(spn) diff --git a/pdftocgen/recipe.py b/pdftocgen/recipe.py new file mode 100644 index 0000000000000000000000000000000000000000..010bb31fa61e5edb29792a33fd4c6931bf3fdbee --- /dev/null +++ b/pdftocgen/recipe.py @@ -0,0 +1,188 @@ +from dataclasses import dataclass +from typing import Optional, List, Dict, Iterator +from .filter import ToCFilter +from fitzutils import ToCEntry +from itertools import chain +from collections import defaultdict +from fitz import Document + + +class FoundGreedy(Exception): + """A hacky solution to do short-circuiting in Python. + + The main reason to do this short-circuiting is to untangle the logic of + greedy filter with normal execution, which makes the typing and code much + cleaner, but it can also save some unecessary comparisons. + + Probably similar to call/cc in scheme or longjump in C + c.f. 
def concatFrag(frags: Iterator[Optional[Fragment]], sep: str = " ") -> Dict[int, str]:
    """Join heading fragments into one title string per level.

    Argument
        frags: iterator of fragments; None entries are skipped
        sep: separator inserted between fragments of the same level
    Returns
        a dictionary (level -> title) that contains the title for each level.
    """
    # gather the text pieces belonging to each heading level, in order
    pieces: Dict[int, list] = {}
    for frag in frags:
        if frag is None:
            continue
        pieces.setdefault(frag.level, []).append(frag.text)
    return {level: sep.join(texts) for level, texts in pieces.items()}
dict) -> List[Optional[Fragment]]: + """Extract matching heading fragments in a line. + + Argument + line: a line dictionary + { + 'bbox': (float, float, float, float), + 'wmode': int, + 'dir': (float, float), + 'spans': [dict] + } + Returns + a list of fragments concatenated from result in a line + """ + return [self._extract_span(spn) for spn in line.get('spans', [])] + + def extract_block(self, block: dict, page: int) -> List[ToCEntry]: + """Extract matching headings in a block. + + Argument + block: a block dictionary + { + 'bbox': (float, float, float, float), + 'lines': [dict], + 'type': int + } + Returns + a list of toc entries, concatenated from the result of lines + """ + if block.get('type') != 0: + # not a text block + return [] + + vpos = block.get('bbox', (0, 0))[1] + + try: + frags = chain.from_iterable([ + self._extract_line(ln) for ln in block.get('lines') + ]) + titles = concatFrag(frags) + + return [ + ToCEntry(level, title, page, vpos) + for level, title in titles.items() + ] + except FoundGreedy as e: + # Smart Greedy: Only merged text that MATCHES the filter + # Find the filter that triggered this level + relevant_filter = next((f for f in self.filters if f.level == e.level), None) + + parts = [] + if relevant_filter: + for ln in block.get('lines', []): + for spn in ln.get('spans', []): + if relevant_filter.admits(spn): + parts.append(spn.get('text', "").strip()) + + merged_text = " ".join(parts) + if merged_text: + return [ToCEntry(e.level, merged_text, page, vpos)] + else: + return [] + + +def extract_toc(doc: Document, recipe: Recipe) -> List[ToCEntry]: + """Extract toc entries from a document + + Arguments + doc: a pdf document + recipe: recipe from user + Returns + a list of toc entries in the document + """ + result = [] + + for page in doc.pages(): + for blk in page.get_textpage().extractDICT().get('blocks', []): + result.extend( + recipe.extract_block(blk, page.number + 1) + ) + + return result diff --git a/pdftocgen/tocgen.py 
def gen_toc(doc: Document, recipe_dict: dict) -> List[ToCEntry]:
    """Generate the table of content for a document from recipe

    Argument
        doc: a pdf document
        recipe_dict: the recipe dictionary used to generate the toc
    Returns
        a list of ToC entries
    """
    recipe = Recipe(recipe_dict)
    return extract_toc(doc, recipe)
+ +This command can operate in two ways: it can either be used +to extract the table of contents of a PDF, or import table +of contents to a PDF using the output of pdftocgen. + +1. To extract the table of contents of a PDF for + modification, only supply a input file: + + $ pdftocio in.pdf + + or if you want to print it in a readable format, use the + -H flag: + + $ pdftocio -H in.pdf + +2. To import a table of contents to a PDF using the toc file + generated by pdftocgen, use input redirection, + + $ pdftocio in.pdf < toc + + pipes, + + $ pdftocgen -r recipe.toml in.pdf | pdftocio in.pdf + + or the -t flag + + $ pdftocio -t toc in.pdf + + to supply the toc file. If you want to specify an output + file name, use the -o option + + $ pdftocio -t toc -o out.pdf in.pdf + +arguments + in.pdf path to the input PDF document + +options + -h, --help show help + -t, --toc=toc path to the table of contents generated by + pdftocgen. if this option is not given, the + default is stdin, but if no input is piped or + redirected to stdin, this program will instead + print the existing ToC of the PDF file + -v, --vpos if this flag is set, the vertical position of + each heading will be dumped to the output + -p, --print when flag is set, print the existing ToC in + the input PDF file. this flag is usually not + necessary, since it is the default behavior + when no input is given + -H, --human-readable print the toc in a readable format + -o, --out=file.pdf path to the output file. 
if this flag is not + specified, the default is {input}_out.pdf + -g, --debug enable debug mode + -V, --version show version number + +[1]: https://krasjet.com/voice/pdf.tocgen/#step-1-build-a-recipe +""".strip() + + +def main(): + # parse arguments + try: + opts, args = getopt.gnu_getopt( + sys.argv[1:], + "hvt:pHo:gV", + ["help", "vpos", "toc=", "print", "human-readable", "out=", "debug", "version"] + ) + except GetoptError as e: + print(e, file=sys.stderr) + print(usage_s, file=sys.stderr) + sys.exit(2) + + toc_file: TextIO = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='ignore') + print_toc: bool = False + readable: bool = False + out: Optional[str] = None + vpos: bool = False + debug: bool = False + + for o, a in opts: + if o in ("-H", "--human-readable"): + readable = True + elif o in ("-p", "--print"): + print_toc = True + elif o in ("-v", "--vpos"): + vpos = True + elif o in ("-t", "--toc"): + try: + toc_file = open(a, "r", encoding=get_file_encoding(a)) + except IOError as e: + print("error: can't open file for reading", file=sys.stderr) + print(e, file=sys.stderr) + sys.exit(1) + elif o in ("-o", "--out"): + out = a + elif o in ("-g", "--debug"): + debug = True + elif o in ("-V", "--version"): + print("pdftocio", pdftocio.__version__, file=sys.stderr) + sys.exit() + elif o in ("-h", "--help"): + print(help_s, file=sys.stderr) + sys.exit() + + if len(args) < 1: + print("error: no input pdf is given", file=sys.stderr) + print(usage_s, file=sys.stderr) + sys.exit(1) + + path_in: str = args[0] + # done parsing arguments + + try: + with open_pdf(path_in) as doc: + if toc_file.isatty() or print_toc: + # no input from user, switch to output mode and extract the toc + # of pdf + toc = read_toc(doc) + if len(toc) == 0: + print("error: no table of contents found", file=sys.stderr) + sys.exit(1) + + stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='ignore') + + if readable: + print(pprint_toc(toc), file=stdout) + else: + 
def read_toc(doc: Document) -> List[ToCEntry]:
    """Read table of contents from a document.

    get_toc(False) yields [level, title, page] entries plus an optional
    fourth dict; when that dict carries a 'to' destination point, its y
    coordinate is preserved as the entry's vertical position.
    """
    entries = []
    for e in doc.get_toc(False):
        if len(e) == 4 and 'to' in e[3]:
            entries.append(ToCEntry(e[0], e[1], e[2], e[3]['to'].y))
        else:
            entries.append(ToCEntry(e[0], e[1], e[2]))
    return entries
ToCEntry: + """parse a row in csv to a toc entry""" + + # a somewhat weird hack, csv reader would read spaces as an empty '', so we + # only need to count the number of '' before an entry to determined the + # heading level + indent = len(list(takewhile(lambda x: x == '', entry))) + try: + toc_entry = ToCEntry( + int(indent / 4) + 1, # 4 spaces = 1 level + entry[indent], # heading + int(entry[indent + 1]), # pagenum + *entry[indent + 2:] # vpos + ) + return toc_entry + except IndexError as e: + print(f"Unable to parse toc entry {entry};", + f"Need at least {indent + 2} parts but only have {len(entry)}.", + "Make sure the page number is present.", + file=sys.stderr) + raise e + + +def parse_toc(file: IO) -> List[ToCEntry]: + """Parse a toc file to a list of toc entries""" + reader = csv.reader(file, lineterminator='\n', + delimiter=' ', quoting=csv.QUOTE_NONNUMERIC) + return list(map(parse_entry, reader)) diff --git a/pdfxmeta/__init__.py b/pdfxmeta/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..207212c694c3bb9c517747e1e6953ce3666ad8b0 --- /dev/null +++ b/pdfxmeta/__init__.py @@ -0,0 +1,5 @@ +"""Extract metadata (fonts, bounding box) for a string in a pdf""" + +__version__ = '1.3.4' + +from .pdfxmeta import extract_meta, dump_meta, dump_toml diff --git a/pdfxmeta/__main__.py b/pdfxmeta/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..528380d62037100b8f2f0746d849a6ff67e2e3be --- /dev/null +++ b/pdfxmeta/__main__.py @@ -0,0 +1,4 @@ +from .app import main + +if __name__ == '__main__': + main() diff --git a/pdfxmeta/app.py b/pdfxmeta/app.py new file mode 100644 index 0000000000000000000000000000000000000000..3d27c9f31430574a39e4c0001fd9b93f558c8174 --- /dev/null +++ b/pdfxmeta/app.py @@ -0,0 +1,147 @@ +"""The executable of pdfxmeta""" + +import getopt +import sys +import pdfxmeta +import io + +from getopt import GetoptError +from typing import Optional, TextIO +from fitzutils import open_pdf +from 
textwrap import indent +from pdfxmeta import dump_meta, dump_toml, extract_meta + + +usage_s = """ +usage: pdfxmeta [options] doc.pdf [pattern] +""".strip() + +help_s = """ +usage: pdfxmeta [options] doc.pdf [pattern] + +Extract the metadata for pattern in doc.pdf. + +To use this command, first open up the pdf file with your +favorite pdf reader and find the text you want to search +for. Then use + + $ pdfxmeta -p 1 in.pdf "Subsection One" + +to find the metadata, mainly the font attributes and +bounding box, of lines containing the pattern "Subsection +One" on page 1. Specifying a page number is optional but +highly recommended, since it greatly reduces the ambiguity +of matches and execution time. + +The output of this command can be directly copy-pasted to +build a recipe file for pdftocgen. Alternatively, you could +also use the --auto or -a flag to output a valid heading +filter directly + + $ pdfxmeta -p 1 -a 2 in.pdf "Subsection One" >> recipe.toml + +where the argument of -a is the level of the heading filter, +which in this case is 2. + +arguments + doc.pdf path to the input PDF document + [pattern] the pattern to search for (python regex). if not + given, dump the entire document + +options + -h, --help show help + -p, --page=PAGE specify the page to search for (1-based index) + -i, --ignore-case when flag is set, search will be case-insensitive + -a, --auto=LEVEL when flag is set, the output would be a valid + heading filter of the specified heading level in + default settings. it is directly usable by + pdftocgen. + -o, --out=FILE path to the output file. 
if this flag is not + specified, the default is stdout + -V, --version show version number +""".strip() + + +def print_result(meta: dict) -> str: + """pretty print results in a structured manner""" + return f"{meta.get('text', '')}:\n{indent(dump_meta(meta), ' ')}" + + +def main(): + # parse arguments + try: + opts, args = getopt.gnu_getopt( + sys.argv[1:], + "hiVp:a:o:", + ["help", "ignore-case", "version", "page=", "auto=", "out="] + ) + except GetoptError as e: + print(e, file=sys.stderr) + print(usage_s, file=sys.stderr) + sys.exit(2) + + ignore_case: bool = False + page: Optional[int] = None + auto_level: Optional[int] = None + out: TextIO = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='ignore') + + for o, a in opts: + if o in ("-i", "--ignore-case"): + ignore_case = True + elif o in ("-p", "--page"): + try: + page = int(a) + except ValueError as e: + print("error: invalid page number", file=sys.stderr) + sys.exit(1) + elif o in ("-a", "--auto"): + try: + auto_level = int(a) + except ValueError as e: + print("error: invalid level", file=sys.stderr) + sys.exit(1) + elif o in ("-o", "--out"): + try: + out = open(a, "w", encoding='utf-8', errors='ignore') + except IOError as e: + print("error: can't open file for writing", file=sys.stderr) + print(e, file=sys.stderr) + sys.exit(1) + elif o in ("-V", "--version"): + print("pdfxmeta", pdfxmeta.__version__, file=sys.stderr) + sys.exit() + elif o in ("-h", "--help"): + print(help_s, file=sys.stderr) + sys.exit() + + argc = len(args) + + if argc < 1: + print("error: no input pdf is given", file=sys.stderr) + print(usage_s, file=sys.stderr) + sys.exit(1) + + path_in: str = args[0] + pattern: str = "" + + if argc >= 2: + pattern = args[1] + + # done parsing arguments + + with open_pdf(path_in) as doc: + meta = extract_meta(doc, pattern, page, ignore_case) + + # nothing found + if len(meta) == 0: + sys.exit(1) + + # should we add \n between each output? 
+ addnl = not out.isatty() + + if auto_level: + print('\n'.join( + [dump_toml(m, auto_level, addnl) for m in meta] + ), file=out) + else: + print('\n'.join(map(print_result, meta)), file=out) diff --git a/pdfxmeta/pdfxmeta.py b/pdfxmeta/pdfxmeta.py new file mode 100644 index 0000000000000000000000000000000000000000..acdb949feefd1a3057074362bc46e3c418eba8c1 --- /dev/null +++ b/pdfxmeta/pdfxmeta.py @@ -0,0 +1,194 @@ +"""Extract metadata for a string in a pdf file""" + +from toml.encoder import _dump_str, _dump_float + +import re + +from fitz import Document, Page +from typing import Optional, List + + +def extract_meta(doc: Document, + pattern: str, + page: Optional[int] = None, + ign_case: bool = False + ) -> List[dict]: + """Extract meta for a `pattern` on `page` in a pdf document + + Arguments + doc: document from pymupdf + pattern: a regular expression pattern + page: page number (1-based index), if None is given, search for the + entire document, but this is highly discouraged. + ign_case: ignore case? + """ + result = [] + + if page is None: + pages = doc.pages() + elif 1 <= page <= doc.page_count: + pages = [doc[page - 1]] + else: # page out of range + return result + + regex = re.compile( + pattern, + re.IGNORECASE + ) if ign_case else re.compile(pattern) + + # we could parallelize this, but I don't see a reason + # to *not* specify a page number + for p in pages: + found = search_in_page(regex, p) + for s in found: + s['page_index'] = p.number + 1 + try: + s['page_label'] = p.get_label() + except Exception: + # Fallback if get_label fails due to PyMuPDF version issues + s['page_label'] = "" + result.extend(found) + + return result + + +def search_in_page(regex: re.Pattern, page: Page) -> List[dict]: + """Search for `text` in `page` and extract meta using optimized search_for""" + result = [] + + # 1. 
Use simple string search if regex is just a literal (optimization) + # But since we have a compiled regex, we might need to extract the pattern if it's simple + # Or just use the regex to find matches in the FULL text of the page first? + # PyMuPDF's search_for takes a string. It doesn't support regex directly in wrapped core. + # However, for the purpose of this tool which claims regex support, we have a dilemma. + # But most users searching "Chapter 1" are doing literal searches. + + # If we want to support the user's "Divided World", we need to handle the case where it might be split. + # The most robust way for PDF text search is usually: + # 1. Get all text (with position). + # 2. Run regex on the full text. + # 3. Map match back to bbox. + # 4. Find spans in bbox. + + # BUT, to keep it simple and fix the immediate "spinning" and "missing" issue: + # The previous code iterated every span. + # Let's try to be smarter. + + # For now, let's assume the user pattern is often a literal or we can approximate it. + # If the user provides a regex, we can't easily use search_for. + # However, the user provided "Divided World". + + # Let's fallback to the robust get_text("dict") but optimize the check? + # No, get_text("dict") IS the slow part. + + # Alternative: + # Use page.get_text("text") -> run regex -> if match, THEN get_text("dict")? + # That saves time for pages that DON'T match. + + # Improved Algorithm: + # 1. Extract plain text of the page. + # 2. If regex doesn't match plain text, SKIP the page. (Huge optimization) + # 3. If it does match, perform the detailed span search. + + text_content = page.get_text() + if not regex.search(text_content): + return [] + + # If we are here, there is a match on this page. Now find the exact spans. + # Note: If the text is split across spans, the simple span iterator below will STILL fail to extract the specific span metadata for the *whole* match. + # But at least it won't spin on empty pages. 
def to_bools(var: int) -> str:
    """Render an int as a lowercase TOML boolean string.

    Any non-zero value (e.g. a masked flag bit) becomes "true".
    """
    return "true" if var else "false"
sep else before + + result.append(f"font.name = {_dump_str(font)}") + result.append(f"font.size = {_dump_float(spn['size'])}") + result.append("# font.size_tolerance = 1e-5") + result.append(f"# font.color = {spn['color']:#08x}") + + flags = spn['flags'] + + result.append(f"# font.superscript = {to_bools(flags & 0b00001)}") + result.append(f"# font.italic = {to_bools(flags & 0b00010)}") + result.append(f"# font.serif = {to_bools(flags & 0b00100)}") + result.append(f"# font.monospace = {to_bools(flags & 0b01000)}") + result.append(f"# font.bold = {to_bools(flags & 0b10000)}") + + bbox = spn['bbox'] + + result.append(f"# bbox.left = {_dump_float(bbox[0])}") + result.append(f"# bbox.top = {_dump_float(bbox[1])}") + result.append(f"# bbox.right = {_dump_float(bbox[2])}") + result.append(f"# bbox.bottom = {_dump_float(bbox[3])}") + result.append("# bbox.tolerance = 1e-5") + + if trail_nl: + result.append("") + + return '\n'.join(result) diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000000000000000000000000000000000000..a86919f10476ffc87b01327ec1399cdd1bf39d9d --- /dev/null +++ b/poetry.lock @@ -0,0 +1,534 @@ +# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. + +[[package]] +name = "args" +version = "0.1.0" +description = "Command Arguments for Humans." +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "args-0.1.0.tar.gz", hash = "sha256:a785b8d837625e9b61c39108532d95b85274acd679693b71ebb5156848fcf814"}, +] + +[[package]] +name = "astroid" +version = "2.11.7" +description = "An abstract syntax tree for Python with inference support." 
+category = "dev" +optional = false +python-versions = ">=3.6.2" +files = [ + {file = "astroid-2.11.7-py3-none-any.whl", hash = "sha256:86b0a340a512c65abf4368b80252754cda17c02cdbbd3f587dddf98112233e7b"}, + {file = "astroid-2.11.7.tar.gz", hash = "sha256:bb24615c77f4837c707669d16907331374ae8a964650a66999da3f5ca68dc946"}, +] + +[package.dependencies] +lazy-object-proxy = ">=1.4.0" +setuptools = ">=20.0" +typed-ast = {version = ">=1.4.0,<2.0", markers = "implementation_name == \"cpython\" and python_version < \"3.8\""} +typing-extensions = {version = ">=3.10", markers = "python_version < \"3.10\""} +wrapt = ">=1.11,<2" + +[[package]] +name = "chardet" +version = "5.1.0" +description = "Universal encoding detector for Python 3" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "chardet-5.1.0-py3-none-any.whl", hash = "sha256:362777fb014af596ad31334fde1e8c327dfdb076e1960d1694662d46a6917ab9"}, + {file = "chardet-5.1.0.tar.gz", hash = "sha256:0d62712b956bc154f85fb0a266e2a3c5913c2967e00348701b32411d6def31e5"}, +] + +[[package]] +name = "clint" +version = "0.5.1" +description = "Python Command Line Interface Tools" +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "clint-0.5.1.tar.gz", hash = "sha256:05224c32b1075563d0b16d0015faaf9da43aa214e4a2140e51f08789e7a4c5aa"}, +] + +[package.dependencies] +args = "*" + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." 
+category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "coverage" +version = "7.2.3" +description = "Code coverage measurement for Python" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "coverage-7.2.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e58c0d41d336569d63d1b113bd573db8363bc4146f39444125b7f8060e4e04f5"}, + {file = "coverage-7.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:344e714bd0fe921fc72d97404ebbdbf9127bac0ca1ff66d7b79efc143cf7c0c4"}, + {file = "coverage-7.2.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:974bc90d6f6c1e59ceb1516ab00cf1cdfbb2e555795d49fa9571d611f449bcb2"}, + {file = "coverage-7.2.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0743b0035d4b0e32bc1df5de70fba3059662ace5b9a2a86a9f894cfe66569013"}, + {file = "coverage-7.2.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d0391fb4cfc171ce40437f67eb050a340fdbd0f9f49d6353a387f1b7f9dd4fa"}, + {file = "coverage-7.2.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4a42e1eff0ca9a7cb7dc9ecda41dfc7cbc17cb1d02117214be0561bd1134772b"}, + {file = "coverage-7.2.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:be19931a8dcbe6ab464f3339966856996b12a00f9fe53f346ab3be872d03e257"}, + {file = "coverage-7.2.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:72fcae5bcac3333a4cf3b8f34eec99cea1187acd55af723bcbd559adfdcb5535"}, + {file = "coverage-7.2.3-cp310-cp310-win32.whl", hash = 
"sha256:aeae2aa38395b18106e552833f2a50c27ea0000122bde421c31d11ed7e6f9c91"}, + {file = "coverage-7.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:83957d349838a636e768251c7e9979e899a569794b44c3728eaebd11d848e58e"}, + {file = "coverage-7.2.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:dfd393094cd82ceb9b40df4c77976015a314b267d498268a076e940fe7be6b79"}, + {file = "coverage-7.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:182eb9ac3f2b4874a1f41b78b87db20b66da6b9cdc32737fbbf4fea0c35b23fc"}, + {file = "coverage-7.2.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bb1e77a9a311346294621be905ea8a2c30d3ad371fc15bb72e98bfcfae532df"}, + {file = "coverage-7.2.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca0f34363e2634deffd390a0fef1aa99168ae9ed2af01af4a1f5865e362f8623"}, + {file = "coverage-7.2.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55416d7385774285b6e2a5feca0af9652f7f444a4fa3d29d8ab052fafef9d00d"}, + {file = "coverage-7.2.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:06ddd9c0249a0546997fdda5a30fbcb40f23926df0a874a60a8a185bc3a87d93"}, + {file = "coverage-7.2.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:fff5aaa6becf2c6a1699ae6a39e2e6fb0672c2d42eca8eb0cafa91cf2e9bd312"}, + {file = "coverage-7.2.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ea53151d87c52e98133eb8ac78f1206498c015849662ca8dc246255265d9c3c4"}, + {file = "coverage-7.2.3-cp311-cp311-win32.whl", hash = "sha256:8f6c930fd70d91ddee53194e93029e3ef2aabe26725aa3c2753df057e296b925"}, + {file = "coverage-7.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:fa546d66639d69aa967bf08156eb8c9d0cd6f6de84be9e8c9819f52ad499c910"}, + {file = "coverage-7.2.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b2317d5ed777bf5a033e83d4f1389fd4ef045763141d8f10eb09a7035cee774c"}, + {file = 
"coverage-7.2.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be9824c1c874b73b96288c6d3de793bf7f3a597770205068c6163ea1f326e8b9"}, + {file = "coverage-7.2.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2c3b2803e730dc2797a017335827e9da6da0e84c745ce0f552e66400abdfb9a1"}, + {file = "coverage-7.2.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f69770f5ca1994cb32c38965e95f57504d3aea96b6c024624fdd5bb1aa494a1"}, + {file = "coverage-7.2.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1127b16220f7bfb3f1049ed4a62d26d81970a723544e8252db0efde853268e21"}, + {file = "coverage-7.2.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:aa784405f0c640940595fa0f14064d8e84aff0b0f762fa18393e2760a2cf5841"}, + {file = "coverage-7.2.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3146b8e16fa60427e03884301bf8209221f5761ac754ee6b267642a2fd354c48"}, + {file = "coverage-7.2.3-cp37-cp37m-win32.whl", hash = "sha256:1fd78b911aea9cec3b7e1e2622c8018d51c0d2bbcf8faaf53c2497eb114911c1"}, + {file = "coverage-7.2.3-cp37-cp37m-win_amd64.whl", hash = "sha256:0f3736a5d34e091b0a611964c6262fd68ca4363df56185902528f0b75dbb9c1f"}, + {file = "coverage-7.2.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:981b4df72c93e3bc04478153df516d385317628bd9c10be699c93c26ddcca8ab"}, + {file = "coverage-7.2.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c0045f8f23a5fb30b2eb3b8a83664d8dc4fb58faddf8155d7109166adb9f2040"}, + {file = "coverage-7.2.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f760073fcf8f3d6933178d67754f4f2d4e924e321f4bb0dcef0424ca0215eba1"}, + {file = "coverage-7.2.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c86bd45d1659b1ae3d0ba1909326b03598affbc9ed71520e0ff8c31a993ad911"}, + {file = 
"coverage-7.2.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:172db976ae6327ed4728e2507daf8a4de73c7cc89796483e0a9198fd2e47b462"}, + {file = "coverage-7.2.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:d2a3a6146fe9319926e1d477842ca2a63fe99af5ae690b1f5c11e6af074a6b5c"}, + {file = "coverage-7.2.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:f649dd53833b495c3ebd04d6eec58479454a1784987af8afb77540d6c1767abd"}, + {file = "coverage-7.2.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7c4ed4e9f3b123aa403ab424430b426a1992e6f4c8fd3cb56ea520446e04d152"}, + {file = "coverage-7.2.3-cp38-cp38-win32.whl", hash = "sha256:eb0edc3ce9760d2f21637766c3aa04822030e7451981ce569a1b3456b7053f22"}, + {file = "coverage-7.2.3-cp38-cp38-win_amd64.whl", hash = "sha256:63cdeaac4ae85a179a8d6bc09b77b564c096250d759eed343a89d91bce8b6367"}, + {file = "coverage-7.2.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:20d1a2a76bb4eb00e4d36b9699f9b7aba93271c9c29220ad4c6a9581a0320235"}, + {file = "coverage-7.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ea748802cc0de4de92ef8244dd84ffd793bd2e7be784cd8394d557a3c751e21"}, + {file = "coverage-7.2.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21b154aba06df42e4b96fc915512ab39595105f6c483991287021ed95776d934"}, + {file = "coverage-7.2.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd214917cabdd6f673a29d708574e9fbdb892cb77eb426d0eae3490d95ca7859"}, + {file = "coverage-7.2.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c2e58e45fe53fab81f85474e5d4d226eeab0f27b45aa062856c89389da2f0d9"}, + {file = "coverage-7.2.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:87ecc7c9a1a9f912e306997ffee020297ccb5ea388421fe62a2a02747e4d5539"}, + {file = "coverage-7.2.3-cp39-cp39-musllinux_1_1_i686.whl", hash = 
"sha256:387065e420aed3c71b61af7e82c7b6bc1c592f7e3c7a66e9f78dd178699da4fe"}, + {file = "coverage-7.2.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ea3f5bc91d7d457da7d48c7a732beaf79d0c8131df3ab278e6bba6297e23c6c4"}, + {file = "coverage-7.2.3-cp39-cp39-win32.whl", hash = "sha256:ae7863a1d8db6a014b6f2ff9c1582ab1aad55a6d25bac19710a8df68921b6e30"}, + {file = "coverage-7.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:3f04becd4fcda03c0160d0da9c8f0c246bc78f2f7af0feea1ec0930e7c93fa4a"}, + {file = "coverage-7.2.3-pp37.pp38.pp39-none-any.whl", hash = "sha256:965ee3e782c7892befc25575fa171b521d33798132692df428a09efacaffe8d0"}, + {file = "coverage-7.2.3.tar.gz", hash = "sha256:d298c2815fa4891edd9abe5ad6e6cb4207104c7dd9fd13aea3fdebf6f9b91259"}, +] + +[package.extras] +toml = ["tomli"] + +[[package]] +name = "dill" +version = "0.3.6" +description = "serialize all of python" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "dill-0.3.6-py3-none-any.whl", hash = "sha256:a07ffd2351b8c678dfc4a856a3005f8067aea51d6ba6c700796a4d9e280f39f0"}, + {file = "dill-0.3.6.tar.gz", hash = "sha256:e5db55f3687856d8fbdab002ed78544e1c4559a130302693d839dfe8f93f2373"}, +] + +[package.extras] +graph = ["objgraph (>=1.7.2)"] + +[[package]] +name = "isort" +version = "5.11.5" +description = "A Python utility / library to sort Python imports." 
+category = "dev" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "isort-5.11.5-py3-none-any.whl", hash = "sha256:ba1d72fb2595a01c7895a5128f9585a5cc4b6d395f1c8d514989b9a7eb2a8746"}, + {file = "isort-5.11.5.tar.gz", hash = "sha256:6be1f76a507cb2ecf16c7cf14a37e41609ca082330be4e3436a18ef74add55db"}, +] + +[package.extras] +colors = ["colorama (>=0.4.3,<0.5.0)"] +pipfile-deprecated-finder = ["pip-shims (>=0.5.2)", "pipreqs", "requirementslib"] +plugins = ["setuptools"] +requirements-deprecated-finder = ["pip-api", "pipreqs"] + +[[package]] +name = "jedi" +version = "0.17.2" +description = "An autocompletion tool for Python that can be used for text editors." +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "jedi-0.17.2-py2.py3-none-any.whl", hash = "sha256:98cc583fa0f2f8304968199b01b6b4b94f469a1f4a74c1560506ca2a211378b5"}, + {file = "jedi-0.17.2.tar.gz", hash = "sha256:86ed7d9b750603e4ba582ea8edc678657fb4007894a12bcf6f4bb97892f31d20"}, +] + +[package.dependencies] +parso = ">=0.7.0,<0.8.0" + +[package.extras] +qa = ["flake8 (==3.7.9)"] +testing = ["Django (<3.1)", "colorama", "docopt", "pytest (>=3.9.0,<5.0.0)"] + +[[package]] +name = "lazy-object-proxy" +version = "1.9.0" +description = "A fast and thorough lazy object proxy." 
+category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "lazy-object-proxy-1.9.0.tar.gz", hash = "sha256:659fb5809fa4629b8a1ac5106f669cfc7bef26fbb389dda53b3e010d1ac4ebae"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b40387277b0ed2d0602b8293b94d7257e17d1479e257b4de114ea11a8cb7f2d7"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8c6cfb338b133fbdbc5cfaa10fe3c6aeea827db80c978dbd13bc9dd8526b7d4"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:721532711daa7db0d8b779b0bb0318fa87af1c10d7fe5e52ef30f8eff254d0cd"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:66a3de4a3ec06cd8af3f61b8e1ec67614fbb7c995d02fa224813cb7afefee701"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1aa3de4088c89a1b69f8ec0dcc169aa725b0ff017899ac568fe44ddc1396df46"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-win32.whl", hash = "sha256:f0705c376533ed2a9e5e97aacdbfe04cecd71e0aa84c7c0595d02ef93b6e4455"}, + {file = "lazy_object_proxy-1.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:ea806fd4c37bf7e7ad82537b0757999264d5f70c45468447bb2b91afdbe73a6e"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:946d27deaff6cf8452ed0dba83ba38839a87f4f7a9732e8f9fd4107b21e6ff07"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79a31b086e7e68b24b99b23d57723ef7e2c6d81ed21007b6281ebcd1688acb0a"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f699ac1c768270c9e384e4cbd268d6e67aebcfae6cd623b4d7c3bfde5a35db59"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = 
"sha256:bfb38f9ffb53b942f2b5954e0f610f1e721ccebe9cce9025a38c8ccf4a5183a4"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:189bbd5d41ae7a498397287c408617fe5c48633e7755287b21d741f7db2706a9"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-win32.whl", hash = "sha256:81fc4d08b062b535d95c9ea70dbe8a335c45c04029878e62d744bdced5141586"}, + {file = "lazy_object_proxy-1.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:f2457189d8257dd41ae9b434ba33298aec198e30adf2dcdaaa3a28b9994f6adb"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:d9e25ef10a39e8afe59a5c348a4dbf29b4868ab76269f81ce1674494e2565a6e"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cbf9b082426036e19c6924a9ce90c740a9861e2bdc27a4834fd0a910742ac1e8"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f5fa4a61ce2438267163891961cfd5e32ec97a2c444e5b842d574251ade27d2"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:8fa02eaab317b1e9e03f69aab1f91e120e7899b392c4fc19807a8278a07a97e8"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e7c21c95cae3c05c14aafffe2865bbd5e377cfc1348c4f7751d9dc9a48ca4bda"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-win32.whl", hash = "sha256:f12ad7126ae0c98d601a7ee504c1122bcef553d1d5e0c3bfa77b16b3968d2734"}, + {file = "lazy_object_proxy-1.9.0-cp37-cp37m-win_amd64.whl", hash = "sha256:edd20c5a55acb67c7ed471fa2b5fb66cb17f61430b7a6b9c3b4a1e40293b1671"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2d0daa332786cf3bb49e10dc6a17a52f6a8f9601b4cf5c295a4f85854d61de63"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:9cd077f3d04a58e83d04b20e334f678c2b0ff9879b9375ed107d5d07ff160171"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:660c94ea760b3ce47d1855a30984c78327500493d396eac4dfd8bd82041b22be"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:212774e4dfa851e74d393a2370871e174d7ff0ebc980907723bb67d25c8a7c30"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f0117049dd1d5635bbff65444496c90e0baa48ea405125c088e93d9cf4525b11"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-win32.whl", hash = "sha256:0a891e4e41b54fd5b8313b96399f8b0e173bbbfc03c7631f01efbe29bb0bcf82"}, + {file = "lazy_object_proxy-1.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:9990d8e71b9f6488e91ad25f322898c136b008d87bf852ff65391b004da5e17b"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9e7551208b2aded9c1447453ee366f1c4070602b3d932ace044715d89666899b"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f83ac4d83ef0ab017683d715ed356e30dd48a93746309c8f3517e1287523ef4"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7322c3d6f1766d4ef1e51a465f47955f1e8123caee67dd641e67d539a534d006"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:18b78ec83edbbeb69efdc0e9c1cb41a3b1b1ed11ddd8ded602464c3fc6020494"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:09763491ce220c0299688940f8dc2c5d05fd1f45af1e42e636b2e8b2303e4382"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-win32.whl", hash = "sha256:9090d8e53235aa280fc9239a86ae3ea8ac58eff66a705fa6aa2ec4968b95c821"}, + {file = "lazy_object_proxy-1.9.0-cp39-cp39-win_amd64.whl", hash = 
"sha256:db1c1722726f47e10e0b5fdbf15ac3b8adb58c091d12b3ab713965795036985f"}, +] + +[[package]] +name = "mamba" +version = "0.11.2" +description = "The definitive testing tool for Python. Born under the banner of Behavior Driven Development." +category = "dev" +optional = false +python-versions = "*" +files = [ + {file = "mamba-0.11.2.tar.gz", hash = "sha256:75cfc6dfd287dcccaf86dd753cf48e0a7337487c7c3fafda05a6a67ded6da496"}, +] + +[package.dependencies] +clint = "*" +coverage = "*" + +[[package]] +name = "mccabe" +version = "0.7.0" +description = "McCabe checker, plugin for flake8" +category = "dev" +optional = false +python-versions = ">=3.6" +files = [ + {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, + {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, +] + +[[package]] +name = "parso" +version = "0.7.1" +description = "A Python Parser" +category = "dev" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "parso-0.7.1-py2.py3-none-any.whl", hash = "sha256:97218d9159b2520ff45eb78028ba8b50d2bc61dcc062a9682666f2dc4bd331ea"}, + {file = "parso-0.7.1.tar.gz", hash = "sha256:caba44724b994a8a5e086460bb212abc5a8bc46951bf4a9a1210745953622eb9"}, +] + +[package.extras] +testing = ["docopt", "pytest (>=3.0.7)"] + +[[package]] +name = "platformdirs" +version = "3.2.0" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
+category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "platformdirs-3.2.0-py3-none-any.whl", hash = "sha256:ebe11c0d7a805086e99506aa331612429a72ca7cd52a1f0d277dc4adc20cb10e"}, + {file = "platformdirs-3.2.0.tar.gz", hash = "sha256:d5b638ca397f25f979350ff789db335903d7ea010ab28903f57b27e1b16c2b08"}, +] + +[package.dependencies] +typing-extensions = {version = ">=4.5", markers = "python_version < \"3.8\""} + +[package.extras] +docs = ["furo (>=2022.12.7)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.22,!=1.23.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.2.2)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"] + +[[package]] +name = "pylint" +version = "2.13.9" +description = "python code static checker" +category = "dev" +optional = false +python-versions = ">=3.6.2" +files = [ + {file = "pylint-2.13.9-py3-none-any.whl", hash = "sha256:705c620d388035bdd9ff8b44c5bcdd235bfb49d276d488dd2c8ff1736aa42526"}, + {file = "pylint-2.13.9.tar.gz", hash = "sha256:095567c96e19e6f57b5b907e67d265ff535e588fe26b12b5ebe1fc5645b2c731"}, +] + +[package.dependencies] +astroid = ">=2.11.5,<=2.12.0-dev0" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +dill = ">=0.2" +isort = ">=4.2.5,<6" +mccabe = ">=0.6,<0.8" +platformdirs = ">=2.2.0" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = {version = ">=3.10.0", markers = "python_version < \"3.10\""} + +[package.extras] +testutil = ["gitpython (>3)"] + +[[package]] +name = "pymupdf" +version = "1.22.1" +description = "Python bindings for the PDF toolkit and renderer MuPDF" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "PyMuPDF-1.22.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6bda7a64a1263f1c2b6421ae8803db50d4c8a67de95e05d7a38c313de913b0de"}, + {file = "PyMuPDF-1.22.1-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:b5f62ad244b04b7aa5e7d50b06b8bbc582b2f1d0f2c66013051463d63dfe6c5e"}, + {file = "PyMuPDF-1.22.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce633b9d522528959988647dfbd2c9144ad5422dd75e89e60039da36a412fd3c"}, + {file = "PyMuPDF-1.22.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:733e7b87765ea55202b042b7c84c6b94185ee29fe3a2bd2ee02681c0fd584033"}, + {file = "PyMuPDF-1.22.1-cp310-cp310-win32.whl", hash = "sha256:701499f0a17ccc8dd80707dbeb3a2e60657a6bdc05be7c8c69fa60eb134e1805"}, + {file = "PyMuPDF-1.22.1-cp310-cp310-win_amd64.whl", hash = "sha256:81fa90d157ef7b2ecd72eedafe9db56d3b0f8c3b392d7a2057f659bfcc1f7cad"}, + {file = "PyMuPDF-1.22.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4edac1dd8e5c35b55420925b5486bec4427b07a073cd03f6081b7234ed37217e"}, + {file = "PyMuPDF-1.22.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7744b9853fc55df75f6d37a376432eddd450c1d2072f6ef66b392b7229bccdc6"}, + {file = "PyMuPDF-1.22.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:711adc70d664cdd5d361154bb3485546eaa5e8a90827db6abf9c42ca292aa9e1"}, + {file = "PyMuPDF-1.22.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a1d77a3057ad7fc3e2e02e5fedd53199206a49c4b4c5e3ee75458c17d6b739cb"}, + {file = "PyMuPDF-1.22.1-cp311-cp311-win32.whl", hash = "sha256:b5eca48ea55eafcea68b14669a9f5030c15056431b10710d863de9f9a6b1a0ce"}, + {file = "PyMuPDF-1.22.1-cp311-cp311-win_amd64.whl", hash = "sha256:8e0bfbd6195f45326f9182fff04ac2af9568d78fc1f32dcfa15f84a302d8aafe"}, + {file = "PyMuPDF-1.22.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:440efca115e70c8cdfc492e98b182e24c565d8e68f26754e28e61cf108a915d9"}, + {file = "PyMuPDF-1.22.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a70ab2d38b366c7237adce7d54f3028a7825f165a73c137a1746a6b592d26bb2"}, + {file = "PyMuPDF-1.22.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:7e4a924ffecb8046fbfe7dff9b69f9938389f094dccab07a378850bf9f889c62"}, + {file = "PyMuPDF-1.22.1-cp37-cp37m-win32.whl", hash = "sha256:24e66c2ff4d6cfee5b082c3e2c92b40214799888bf2efcca1f70108c3dfedddb"}, + {file = "PyMuPDF-1.22.1-cp37-cp37m-win_amd64.whl", hash = "sha256:51504bfa2ee207c5c1a38d47b4b91af1bacbd8937b959d947d81fc8f7e023bd8"}, + {file = "PyMuPDF-1.22.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:219337a3be00df2bf65071d5e4e1e6759afd06310d4ec7b1c9694a5b03b5d8d6"}, + {file = "PyMuPDF-1.22.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:050719cb42a8847d564af1d8509d7290176e7c4fde6da7be5751303fa8237aed"}, + {file = "PyMuPDF-1.22.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5871b9e38e68b92533fb7c6fbe3eb7b059f5071d4c2e3ff51cedcc73c994afbc"}, + {file = "PyMuPDF-1.22.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5a0332d6dac4ebf32cb7f0c8639b22b56c9475cb87bc0a0361f9cdc9c2d08a1"}, + {file = "PyMuPDF-1.22.1-cp38-cp38-win32.whl", hash = "sha256:127985812c4a2f0106375c4f4916ca68c1559d6b224a050ce75393e454333995"}, + {file = "PyMuPDF-1.22.1-cp38-cp38-win_amd64.whl", hash = "sha256:99764c46fb8df253a3ea9fbb13b132f205561d6227b0d00e673998b18d7280eb"}, + {file = "PyMuPDF-1.22.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fdb21332d28567e278008dd6130564ac0f5de8aff364a1e7809a70a0f969df26"}, + {file = "PyMuPDF-1.22.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:88202e42d957a41deff212dcb1d8e16e469d21d09a72ab372ee2f173a22112c8"}, + {file = "PyMuPDF-1.22.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:36b7fd85f5813045f10b65caf4cbdad03b51b07076f07b205853a1e44c898e34"}, + {file = "PyMuPDF-1.22.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45e601f7b1ee2a0c1a261bb0179eba4a9899117404eccf0a573e6497ed507ea8"}, + {file = "PyMuPDF-1.22.1-cp39-cp39-win32.whl", hash = "sha256:c610acdbd2f2d994130341559f26c098df546a1fc187adee3b63a0f489310808"}, + {file 
= "PyMuPDF-1.22.1-cp39-cp39-win_amd64.whl", hash = "sha256:af1e6d5dd122c097f23a7e89f8c2197310e85a4c8e8f63ff94444188d9bc0a4e"}, + {file = "PyMuPDF-1.22.1.tar.gz", hash = "sha256:ad34bba78ce147cee50e1dc30fa16f29135a4c3d6a2b1c1b0403ebbcc9fbe4be"}, +] + +[[package]] +name = "setuptools" +version = "67.7.0" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "setuptools-67.7.0-py3-none-any.whl", hash = "sha256:888be97fde8cc3afd60f7784e678fa29ee13c4e5362daa7104a93bba33646c50"}, + {file = "setuptools-67.7.0.tar.gz", hash = "sha256:b7e53a01c6c654d26d2999ee033d8c6125e5fa55f03b7b193f937ae7ac999f22"}, +] + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] + +[[package]] +name = "toml" +version = "0.10.2" +description = "Python Library for Tom's Obvious, Minimal Language" +category = "main" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "toml-0.10.2-py2.py3-none-any.whl", hash = 
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, + {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, +] + +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] + +[[package]] +name = "typed-ast" +version = "1.5.4" +description = "a fork of Python 2 and 3 ast modules with type comment support" +category = "dev" +optional = false +python-versions = ">=3.6" +files = [ + {file = "typed_ast-1.5.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:669dd0c4167f6f2cd9f57041e03c3c2ebf9063d0757dc89f79ba1daa2bfca9d4"}, + {file = "typed_ast-1.5.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:211260621ab1cd7324e0798d6be953d00b74e0428382991adfddb352252f1d62"}, + {file = "typed_ast-1.5.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:267e3f78697a6c00c689c03db4876dd1efdfea2f251a5ad6555e82a26847b4ac"}, + {file = "typed_ast-1.5.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:c542eeda69212fa10a7ada75e668876fdec5f856cd3d06829e6aa64ad17c8dfe"}, + {file = "typed_ast-1.5.4-cp310-cp310-win_amd64.whl", hash = "sha256:a9916d2bb8865f973824fb47436fa45e1ebf2efd920f2b9f99342cb7fab93f72"}, + {file = "typed_ast-1.5.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:79b1e0869db7c830ba6a981d58711c88b6677506e648496b1f64ac7d15633aec"}, + {file = "typed_ast-1.5.4-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a94d55d142c9265f4ea46fab70977a1944ecae359ae867397757d836ea5a3f47"}, + {file = 
"typed_ast-1.5.4-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:183afdf0ec5b1b211724dfef3d2cad2d767cbefac291f24d69b00546c1837fb6"}, + {file = "typed_ast-1.5.4-cp36-cp36m-win_amd64.whl", hash = "sha256:639c5f0b21776605dd6c9dbe592d5228f021404dafd377e2b7ac046b0349b1a1"}, + {file = "typed_ast-1.5.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:cf4afcfac006ece570e32d6fa90ab74a17245b83dfd6655a6f68568098345ff6"}, + {file = "typed_ast-1.5.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed855bbe3eb3715fca349c80174cfcfd699c2f9de574d40527b8429acae23a66"}, + {file = "typed_ast-1.5.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:6778e1b2f81dfc7bc58e4b259363b83d2e509a65198e85d5700dfae4c6c8ff1c"}, + {file = "typed_ast-1.5.4-cp37-cp37m-win_amd64.whl", hash = "sha256:0261195c2062caf107831e92a76764c81227dae162c4f75192c0d489faf751a2"}, + {file = "typed_ast-1.5.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2efae9db7a8c05ad5547d522e7dbe62c83d838d3906a3716d1478b6c1d61388d"}, + {file = "typed_ast-1.5.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7d5d014b7daa8b0bf2eaef684295acae12b036d79f54178b92a2b6a56f92278f"}, + {file = "typed_ast-1.5.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:370788a63915e82fd6f212865a596a0fefcbb7d408bbbb13dea723d971ed8bdc"}, + {file = "typed_ast-1.5.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:4e964b4ff86550a7a7d56345c7864b18f403f5bd7380edf44a3c1fb4ee7ac6c6"}, + {file = "typed_ast-1.5.4-cp38-cp38-win_amd64.whl", hash = "sha256:683407d92dc953c8a7347119596f0b0e6c55eb98ebebd9b23437501b28dcbb8e"}, + {file = "typed_ast-1.5.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4879da6c9b73443f97e731b617184a596ac1235fe91f98d279a7af36c796da35"}, + {file = 
"typed_ast-1.5.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3e123d878ba170397916557d31c8f589951e353cc95fb7f24f6bb69adc1a8a97"}, + {file = "typed_ast-1.5.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ebd9d7f80ccf7a82ac5f88c521115cc55d84e35bf8b446fcd7836eb6b98929a3"}, + {file = "typed_ast-1.5.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98f80dee3c03455e92796b58b98ff6ca0b2a6f652120c263efdba4d6c5e58f72"}, + {file = "typed_ast-1.5.4-cp39-cp39-win_amd64.whl", hash = "sha256:0fdbcf2fef0ca421a3f5912555804296f0b0960f0418c440f5d6d3abb549f3e1"}, + {file = "typed_ast-1.5.4.tar.gz", hash = "sha256:39e21ceb7388e4bb37f4c679d72707ed46c2fbf2a5609b8b8ebc4b067d977df2"}, +] + +[[package]] +name = "typing-extensions" +version = "4.5.0" +description = "Backported and Experimental Type Hints for Python 3.7+" +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "typing_extensions-4.5.0-py3-none-any.whl", hash = "sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"}, + {file = "typing_extensions-4.5.0.tar.gz", hash = "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb"}, +] + +[[package]] +name = "wrapt" +version = "1.15.0" +description = "Module for decorators, wrappers and monkey patching." 
+category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" +files = [ + {file = "wrapt-1.15.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ca1cccf838cd28d5a0883b342474c630ac48cac5df0ee6eacc9c7290f76b11c1"}, + {file = "wrapt-1.15.0-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:e826aadda3cae59295b95343db8f3d965fb31059da7de01ee8d1c40a60398b29"}, + {file = "wrapt-1.15.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:5fc8e02f5984a55d2c653f5fea93531e9836abbd84342c1d1e17abc4a15084c2"}, + {file = "wrapt-1.15.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:96e25c8603a155559231c19c0349245eeb4ac0096fe3c1d0be5c47e075bd4f46"}, + {file = "wrapt-1.15.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:40737a081d7497efea35ab9304b829b857f21558acfc7b3272f908d33b0d9d4c"}, + {file = "wrapt-1.15.0-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:f87ec75864c37c4c6cb908d282e1969e79763e0d9becdfe9fe5473b7bb1e5f09"}, + {file = "wrapt-1.15.0-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:1286eb30261894e4c70d124d44b7fd07825340869945c79d05bda53a40caa079"}, + {file = "wrapt-1.15.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:493d389a2b63c88ad56cdc35d0fa5752daac56ca755805b1b0c530f785767d5e"}, + {file = "wrapt-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:58d7a75d731e8c63614222bcb21dd992b4ab01a399f1f09dd82af17bbfc2368a"}, + {file = "wrapt-1.15.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:21f6d9a0d5b3a207cdf7acf8e58d7d13d463e639f0c7e01d82cdb671e6cb7923"}, + {file = "wrapt-1.15.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ce42618f67741d4697684e501ef02f29e758a123aa2d669e2d964ff734ee00ee"}, + {file = "wrapt-1.15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41d07d029dd4157ae27beab04d22b8e261eddfc6ecd64ff7000b10dc8b3a5727"}, + {file = "wrapt-1.15.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:54accd4b8bc202966bafafd16e69da9d5640ff92389d33d28555c5fd4f25ccb7"}, + {file = "wrapt-1.15.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fbfbca668dd15b744418265a9607baa970c347eefd0db6a518aaf0cfbd153c0"}, + {file = "wrapt-1.15.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:76e9c727a874b4856d11a32fb0b389afc61ce8aaf281ada613713ddeadd1cfec"}, + {file = "wrapt-1.15.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e20076a211cd6f9b44a6be58f7eeafa7ab5720eb796975d0c03f05b47d89eb90"}, + {file = "wrapt-1.15.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a74d56552ddbde46c246b5b89199cb3fd182f9c346c784e1a93e4dc3f5ec9975"}, + {file = "wrapt-1.15.0-cp310-cp310-win32.whl", hash = "sha256:26458da5653aa5b3d8dc8b24192f574a58984c749401f98fff994d41d3f08da1"}, + {file = "wrapt-1.15.0-cp310-cp310-win_amd64.whl", hash = "sha256:75760a47c06b5974aa5e01949bf7e66d2af4d08cb8c1d6516af5e39595397f5e"}, + {file = "wrapt-1.15.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ba1711cda2d30634a7e452fc79eabcadaffedf241ff206db2ee93dd2c89a60e7"}, + {file = "wrapt-1.15.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:56374914b132c702aa9aa9959c550004b8847148f95e1b824772d453ac204a72"}, + {file = "wrapt-1.15.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a89ce3fd220ff144bd9d54da333ec0de0399b52c9ac3d2ce34b569cf1a5748fb"}, + {file = "wrapt-1.15.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3bbe623731d03b186b3d6b0d6f51865bf598587c38d6f7b0be2e27414f7f214e"}, + {file = "wrapt-1.15.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3abbe948c3cbde2689370a262a8d04e32ec2dd4f27103669a45c6929bcdbfe7c"}, + {file = "wrapt-1.15.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b67b819628e3b748fd3c2192c15fb951f549d0f47c0449af0764d7647302fda3"}, + 
{file = "wrapt-1.15.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:7eebcdbe3677e58dd4c0e03b4f2cfa346ed4049687d839adad68cc38bb559c92"}, + {file = "wrapt-1.15.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:74934ebd71950e3db69960a7da29204f89624dde411afbfb3b4858c1409b1e98"}, + {file = "wrapt-1.15.0-cp311-cp311-win32.whl", hash = "sha256:bd84395aab8e4d36263cd1b9308cd504f6cf713b7d6d3ce25ea55670baec5416"}, + {file = "wrapt-1.15.0-cp311-cp311-win_amd64.whl", hash = "sha256:a487f72a25904e2b4bbc0817ce7a8de94363bd7e79890510174da9d901c38705"}, + {file = "wrapt-1.15.0-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:4ff0d20f2e670800d3ed2b220d40984162089a6e2c9646fdb09b85e6f9a8fc29"}, + {file = "wrapt-1.15.0-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:9ed6aa0726b9b60911f4aed8ec5b8dd7bf3491476015819f56473ffaef8959bd"}, + {file = "wrapt-1.15.0-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:896689fddba4f23ef7c718279e42f8834041a21342d95e56922e1c10c0cc7afb"}, + {file = "wrapt-1.15.0-cp35-cp35m-manylinux2010_x86_64.whl", hash = "sha256:75669d77bb2c071333417617a235324a1618dba66f82a750362eccbe5b61d248"}, + {file = "wrapt-1.15.0-cp35-cp35m-win32.whl", hash = "sha256:fbec11614dba0424ca72f4e8ba3c420dba07b4a7c206c8c8e4e73f2e98f4c559"}, + {file = "wrapt-1.15.0-cp35-cp35m-win_amd64.whl", hash = "sha256:fd69666217b62fa5d7c6aa88e507493a34dec4fa20c5bd925e4bc12fce586639"}, + {file = "wrapt-1.15.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:b0724f05c396b0a4c36a3226c31648385deb6a65d8992644c12a4963c70326ba"}, + {file = "wrapt-1.15.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bbeccb1aa40ab88cd29e6c7d8585582c99548f55f9b2581dfc5ba68c59a85752"}, + {file = "wrapt-1.15.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:38adf7198f8f154502883242f9fe7333ab05a5b02de7d83aa2d88ea621f13364"}, + {file = 
"wrapt-1.15.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:578383d740457fa790fdf85e6d346fda1416a40549fe8db08e5e9bd281c6a475"}, + {file = "wrapt-1.15.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:a4cbb9ff5795cd66f0066bdf5947f170f5d63a9274f99bdbca02fd973adcf2a8"}, + {file = "wrapt-1.15.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:af5bd9ccb188f6a5fdda9f1f09d9f4c86cc8a539bd48a0bfdc97723970348418"}, + {file = "wrapt-1.15.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:b56d5519e470d3f2fe4aa7585f0632b060d532d0696c5bdfb5e8319e1d0f69a2"}, + {file = "wrapt-1.15.0-cp36-cp36m-win32.whl", hash = "sha256:77d4c1b881076c3ba173484dfa53d3582c1c8ff1f914c6461ab70c8428b796c1"}, + {file = "wrapt-1.15.0-cp36-cp36m-win_amd64.whl", hash = "sha256:077ff0d1f9d9e4ce6476c1a924a3332452c1406e59d90a2cf24aeb29eeac9420"}, + {file = "wrapt-1.15.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5c5aa28df055697d7c37d2099a7bc09f559d5053c3349b1ad0c39000e611d317"}, + {file = "wrapt-1.15.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a8564f283394634a7a7054b7983e47dbf39c07712d7b177b37e03f2467a024e"}, + {file = "wrapt-1.15.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:780c82a41dc493b62fc5884fb1d3a3b81106642c5c5c78d6a0d4cbe96d62ba7e"}, + {file = "wrapt-1.15.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e169e957c33576f47e21864cf3fc9ff47c223a4ebca8960079b8bd36cb014fd0"}, + {file = "wrapt-1.15.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:b02f21c1e2074943312d03d243ac4388319f2456576b2c6023041c4d57cd7019"}, + {file = "wrapt-1.15.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f2e69b3ed24544b0d3dbe2c5c0ba5153ce50dcebb576fdc4696d52aa22db6034"}, + {file = "wrapt-1.15.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = 
"sha256:d787272ed958a05b2c86311d3a4135d3c2aeea4fc655705f074130aa57d71653"}, + {file = "wrapt-1.15.0-cp37-cp37m-win32.whl", hash = "sha256:02fce1852f755f44f95af51f69d22e45080102e9d00258053b79367d07af39c0"}, + {file = "wrapt-1.15.0-cp37-cp37m-win_amd64.whl", hash = "sha256:abd52a09d03adf9c763d706df707c343293d5d106aea53483e0ec8d9e310ad5e"}, + {file = "wrapt-1.15.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cdb4f085756c96a3af04e6eca7f08b1345e94b53af8921b25c72f096e704e145"}, + {file = "wrapt-1.15.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:230ae493696a371f1dbffaad3dafbb742a4d27a0afd2b1aecebe52b740167e7f"}, + {file = "wrapt-1.15.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:63424c681923b9f3bfbc5e3205aafe790904053d42ddcc08542181a30a7a51bd"}, + {file = "wrapt-1.15.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6bcbfc99f55655c3d93feb7ef3800bd5bbe963a755687cbf1f490a71fb7794b"}, + {file = "wrapt-1.15.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c99f4309f5145b93eca6e35ac1a988f0dc0a7ccf9ccdcd78d3c0adf57224e62f"}, + {file = "wrapt-1.15.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b130fe77361d6771ecf5a219d8e0817d61b236b7d8b37cc045172e574ed219e6"}, + {file = "wrapt-1.15.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:96177eb5645b1c6985f5c11d03fc2dbda9ad24ec0f3a46dcce91445747e15094"}, + {file = "wrapt-1.15.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5fe3e099cf07d0fb5a1e23d399e5d4d1ca3e6dfcbe5c8570ccff3e9208274f7"}, + {file = "wrapt-1.15.0-cp38-cp38-win32.whl", hash = "sha256:abd8f36c99512755b8456047b7be10372fca271bf1467a1caa88db991e7c421b"}, + {file = "wrapt-1.15.0-cp38-cp38-win_amd64.whl", hash = "sha256:b06fa97478a5f478fb05e1980980a7cdf2712015493b44d0c87606c1513ed5b1"}, + {file = "wrapt-1.15.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:2e51de54d4fb8fb50d6ee8327f9828306a959ae394d3e01a1ba8b2f937747d86"}, + {file = "wrapt-1.15.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0970ddb69bba00670e58955f8019bec4a42d1785db3faa043c33d81de2bf843c"}, + {file = "wrapt-1.15.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76407ab327158c510f44ded207e2f76b657303e17cb7a572ffe2f5a8a48aa04d"}, + {file = "wrapt-1.15.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd525e0e52a5ff16653a3fc9e3dd827981917d34996600bbc34c05d048ca35cc"}, + {file = "wrapt-1.15.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d37ac69edc5614b90516807de32d08cb8e7b12260a285ee330955604ed9dd29"}, + {file = "wrapt-1.15.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:078e2a1a86544e644a68422f881c48b84fef6d18f8c7a957ffd3f2e0a74a0d4a"}, + {file = "wrapt-1.15.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2cf56d0e237280baed46f0b5316661da892565ff58309d4d2ed7dba763d984b8"}, + {file = "wrapt-1.15.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7dc0713bf81287a00516ef43137273b23ee414fe41a3c14be10dd95ed98a2df9"}, + {file = "wrapt-1.15.0-cp39-cp39-win32.whl", hash = "sha256:46ed616d5fb42f98630ed70c3529541408166c22cdfd4540b88d5f21006b0eff"}, + {file = "wrapt-1.15.0-cp39-cp39-win_amd64.whl", hash = "sha256:eef4d64c650f33347c1f9266fa5ae001440b232ad9b98f1f43dfe7a79435c0a6"}, + {file = "wrapt-1.15.0-py3-none-any.whl", hash = "sha256:64b1df0f83706b4ef4cfb4fb0e4c2669100fd7ecacfb59e091fad300d4e04640"}, + {file = "wrapt-1.15.0.tar.gz", hash = "sha256:d06730c6aed78cee4126234cf2d071e01b44b915e725a6cb439a879ec9754a3a"}, +] + +[metadata] +lock-version = "2.0" +python-versions = "^3.7" +content-hash = "6dd48af9ea10e0d441e2b6ee3dcdea67bd5b4cc0b6c13b672761212decbaa5f6" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 
0000000000000000000000000000000000000000..ac1e2619053761b11ab2f5e246587fe4e96e1a60 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,43 @@ +[tool.poetry] +name = "pdf.tocgen" +version = "1.3.4" +description = "Automatically generate table of contents for pdf files" +authors = ["krasjet"] +license = "GPL-3.0-or-later" +readme = "README.md" +homepage = "https://krasjet.com/voice/pdf.tocgen/" +repository = "https://github.com/Krasjet/pdf.tocgen" +keywords = ["pdf", "cli"] + +classifiers = [ + "Development Status :: 3 - Alpha", + "Environment :: Console", + "Intended Audience :: End Users/Desktop" +] + +packages = [ + { include = "pdfxmeta" }, + { include = "pdftocgen" }, + { include = "pdftocio" }, + { include = "fitzutils" } +] + +[tool.poetry.dependencies] +python = "^3.7" +PyMuPDF = "^1.18.14" +toml = "^0.10.2" +chardet = "^5.1.0" + +[tool.poetry.dev-dependencies] +pylint = "^2.5.3" +jedi = "^0.17.2" +mamba = "^0.11.1" + +[tool.poetry.scripts] +pdfxmeta = "pdfxmeta.app:main" +pdftocgen = "pdftocgen.app:main" +pdftocio = "pdftocio.app:main" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/recipes/README.md b/recipes/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3c394485bc24eba8adf10a98f4492aac0a3acb11 --- /dev/null +++ b/recipes/README.md @@ -0,0 +1,11 @@ +recipes +======= + +This directory contains some pre-made recipes for `pdftocgen`. It could be a +good reference if you want to craft your own recipes. Feel free to contribute +more. + +The recipes in this directory is separately licensed under the [CC BY-NC-SA 4.0 +License][cc] to prevent any commercial usage. 
+ +[cc]: https://creativecommons.org/licenses/by-nc-sa/4.0/ diff --git a/recipes/default_groff_man.toml b/recipes/default_groff_man.toml new file mode 100644 index 0000000000000000000000000000000000000000..ab3a4690848d38c09d643a7ae01f5f6622768ade --- /dev/null +++ b/recipes/default_groff_man.toml @@ -0,0 +1,12 @@ +# The recipe for +# $ man -Tpdf man > out.pdf +# only tested under groff +[[heading]] +level = 1 +font.name = "Times-Bold" +font.size = 10.949999809265137 +font.superscript = false +font.italic = false +font.serif = true +font.monospace = false +font.bold = true diff --git a/recipes/default_groff_ms.toml b/recipes/default_groff_ms.toml new file mode 100644 index 0000000000000000000000000000000000000000..9e59a1daf1f8c42b3aceeee59700419b905a1017 --- /dev/null +++ b/recipes/default_groff_ms.toml @@ -0,0 +1,12 @@ +# The recipe for the default groff_ms, produced by +# $ groff -ms -Tpdf in.ms > out.pdf + +[[heading]] +level = 1 +font.name = "Times-Bold" +font.size = 10 +bbox.left = 72 + +# All the headings (.NH) have the same font attributes, so you need to manually +# format the heading levels of the toc (for vim users, >> in normal mode will +# add indentation to a line) diff --git a/recipes/default_latex.toml b/recipes/default_latex.toml new file mode 100644 index 0000000000000000000000000000000000000000..f79bacf0d0c7e2b431710240f2e3ff0eb6de1f8a --- /dev/null +++ b/recipes/default_latex.toml @@ -0,0 +1,24 @@ +# The recipe for +# $ pdflatex in.tex +# under default styles (Computer Modern, article class) + +[[heading]] +level = 1 +greedy = true +font.name = "CMBX12" +font.size = 14.346199989318848 +font.size_tolerance = 0.01 + +[[heading]] +level = 2 +greedy = true +font.name = "CMBX12" +font.size = 11.9552001953125 +font.size_tolerance = 0.01 + +[[heading]] +level = 3 +greedy = true +font.name = "CMBX10" +font.size = 9.962599754333496 +font.size_tolerance = 0.01 diff --git a/recipes/ft.toml b/recipes/ft.toml new file mode 100644 index 
0000000000000000000000000000000000000000..eb59c5638bde1ea0a360fa0065c37cc55adeb431 --- /dev/null +++ b/recipes/ft.toml @@ -0,0 +1,23 @@ +# The recipe for "Lecture Notes for EE 261" [1] by Brad Osgood +# +# [1]: https://see.stanford.edu/materials/lsoftaee261/book-fall-07.pdf +# archive: https://web.archive.org/https://see.stanford.edu/materials/lsoftaee261/book-fall-07.pdf + +[[heading]] +level = 1 +greedy = true +font.name = "CMBX12" +font.size = 24.78696060180664 + +[[heading]] +level = 2 +greedy = true +font.name = "CMBX12" +font.size = 14.346190452575684 + +[[heading]] +level = 3 +greedy = true +font.name = "CMBX12" +font.size = 11.955169677734375 + diff --git a/recipes/htdc.toml b/recipes/htdc.toml new file mode 100644 index 0000000000000000000000000000000000000000..3783190ace58e29965265b9611ad95887b612fd4 --- /dev/null +++ b/recipes/htdc.toml @@ -0,0 +1,26 @@ +# The recipe for HtDC by Matthias Felleisen, et al. +# +# The output need some manual clean up. For example, the table of contents in +# the original document is incorrectedly included in the outline, but they +# should be easy to remove using a text editor. +# +# [1]: https://felleisen.org/matthias/HtDC/htdc.pdf + +[[heading]] +level = 1 +font.name = "Palatino-Bold" +font.size = 17.21540069580078 +font.color = 0x221f1f + +[[heading]] +level = 2 +font.name = "Palatino-Bold" +font.size = 14.346199989318848 +font.color = 0x221f1f + +[[heading]] +level = 3 +greedy = true +font.name = "Palatino-Bold" +font.size = 11.9552001953125 +font.color = 0x221f1f diff --git a/recipes/onlisp.toml b/recipes/onlisp.toml new file mode 100644 index 0000000000000000000000000000000000000000..00873f96b8f6bd82e964419ba0e75bbcdc6b9e48 --- /dev/null +++ b/recipes/onlisp.toml @@ -0,0 +1,15 @@ +# The recipe for "On Lisp" [1] by Paul Graham +# +# Note that you need to download the PDF version. The PDF is well structured +# and no extra processing is needed. 
+# [1]: http://www.paulgraham.com/onlisptext.html + +[[heading]] +level = 1 +font.name = "Times-Bold" +font.size = 19.92530059814453 + +[[heading]] +level = 2 +font.name = "Times-Bold" +font.size = 11.9552001953125 diff --git a/recipes/recipe.toml b/recipes/recipe.toml new file mode 100644 index 0000000000000000000000000000000000000000..85a5af2253d5f4db045c3d79c79877277e533e81 --- /dev/null +++ b/recipes/recipe.toml @@ -0,0 +1,5 @@ +[[heading]] +level = 1 +greedy = true +font.name = "CaslonFiveForty-Roman" +font.size = 54.10 diff --git a/requirements.txt b/requirements.txt index 28d994e22f8dd432b51df193562052e315ad95f7..1813feaee654ded47884902fd56ffa99962c83d2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,6 @@ -altair -pandas -streamlit \ No newline at end of file +streamlit +pandas +PyMuPDF==1.25.2 +toml +chardet +. diff --git a/spec/__init__.py b/spec/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/spec/cli_spec.sh b/spec/cli_spec.sh new file mode 100644 index 0000000000000000000000000000000000000000..d65e40dc3e40ddfa5bad92970f2e2f413e8a8dc2 --- /dev/null +++ b/spec/cli_spec.sh @@ -0,0 +1,63 @@ +#!/bin/bash -e + +SPEC="spec/files" + +checkeq() { + if res=$(diff "$1" "$2"); then + echo "[✓]" + else + echo "[✗]" + printf "%s\n" "$res" + return 1 + fi +} + +it() { + printf " it %s " "$*" +} + +printf "pdfxmeta\n" + +it "extracts metadata correctly" +checkeq <(pdfxmeta -p 1 "$SPEC/level2.pdf" "Section") \ + "$SPEC/level2_meta" + +it "extracts metadata in auto mode correctly" +checkeq <(pdfxmeta -a 1 -p 1 "$SPEC/level2.pdf" "Section") \ + "$SPEC/level2_meta.toml" + +printf "\npdftocgen\n" + +it "generates toc for 2 level heading correctly" +checkeq <(pdftocgen "$SPEC/level2.pdf" < "$SPEC/level2_recipe.toml") \ + "$SPEC/level2.toc" + +it "generates toc for one page headings correctly" +checkeq <(pdftocgen "$SPEC/onepage.pdf" < "$SPEC/onepage_greedy.toml") \ + 
"$SPEC/onepage.toc" + +it "generates toc for hard mode correctly" +checkeq <(pdftocgen "$SPEC/hardmode.pdf" < "$SPEC/hardmode_recipe.toml") \ + "$SPEC/hardmode.toc" + +it "generates readable toc" +checkeq <(pdftocgen -H "$SPEC/level2.pdf" < "$SPEC/level2_recipe.toml") \ + "$SPEC/level2_h.toc" + +printf "\npdftocio\n" + +tmpdir=$(mktemp -d) + +it "adds toc to pdf and prints toc correctly" +checkeq <(pdftocgen "$SPEC/hardmode.pdf" < "$SPEC/hardmode_recipe.toml" | \ + pdftocio -o "$tmpdir/out.pdf" "$SPEC/hardmode.pdf" && \ + pdftocio -p "$tmpdir/out.pdf") \ + "$SPEC/hardmode.toc" + +it "prints toc when -p is set" +checkeq <(pdftocio -p "$SPEC/hastoc.pdf" < $SPEC/level2.toc) \ + "$SPEC/hastoc.toc" + +it "prints toc vpos when -v is set" +checkeq <(pdftocio -p -v "$SPEC/hastoc.pdf") \ + "$SPEC/hastoc_v.toc" diff --git a/spec/files/Makefile b/spec/files/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..bc48f892720067d4b6d07e849ce79366c9da0fc5 --- /dev/null +++ b/spec/files/Makefile @@ -0,0 +1,12 @@ +.PHONY: all clean + +all: level2.pdf hastoc.pdf onepage.pdf hardmode.pdf + +%.pdf: %.tex + latexmk -pdf $< + +clean: + rm -f *.aux *.dvi *.fdb_latexmk *.fls *.log *.out + +nuke: clean + rm -f *.pdf diff --git a/spec/files/hardmode.pdf b/spec/files/hardmode.pdf new file mode 100644 index 0000000000000000000000000000000000000000..aed1b467ef6b4926771892d1c2cc7ade6dd1813b --- /dev/null +++ b/spec/files/hardmode.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9be6a1628292675b467b36a503c37ffa4d3073d2ff87d147dced3b3bff394875 +size 110985 diff --git a/spec/files/hardmode.tex b/spec/files/hardmode.tex new file mode 100644 index 0000000000000000000000000000000000000000..48242a7cd593df79ce1f1142fc8fb617bddd8e08 --- /dev/null +++ b/spec/files/hardmode.tex @@ -0,0 +1,68 @@ +\documentclass{article}[12pt] + +\usepackage{lipsum} +\usepackage{multicol} +\usepackage{amsmath} +\usepackage{amsfonts} +\usepackage[USenglish]{babel} 
+\usepackage[stretch=10,shrink=10]{microtype} +\usepackage[left=1.3in, + right=1.3in, + top=1in, + bottom=1in, + footskip=.5in]{geometry} +\setlength{\columnsep}{0.4in} + +\renewcommand{\rmdefault}{zpltlf} +\usepackage{newpxtext} +% will mess up embeded symbols +% \usepackage{newpxmath} + +\title{The hard mode} +\author{krasjet} +\date{} + +\begin{document} +\begin{multicols}{2} +[ + \maketitle +] + +\section{Section One} + +\lipsum[2-3] + +\section{Section $1 + 1 = 2$} + +\lipsum[2-1] +\begin{align*} + x^2 + 2 = 4 +\end{align*} +\lipsum[2-1] + +\subsection{Subsection Two.One} +\lipsum[2-5] + +\section*{$\mathrm{e}^{\ln(3)}$} + +\setcounter{section}{3} +\setcounter{subsection}{0} + +\lipsum[1-2] + +\subsection{Subsection $\mathrm{e}^{\ln(3)}$.1, with looo\-ooooooooong title} +\lipsum[2-5] + +\subsection{$\mathbb{S}$ubsection Three.Two, another long title} +\lipsum[1-1] + +\subsection{Subsection Three.Three} +\lipsum[2-3] + +\section{The $x \to \infty$ End} + +\lipsum[2-2] + +\end{multicols} + +\end{document} diff --git a/spec/files/hardmode.toc b/spec/files/hardmode.toc new file mode 100644 index 0000000000000000000000000000000000000000..aa7398bc42cffb1f98290f1f1d73c5d243f8a342 --- /dev/null +++ b/spec/files/hardmode.toc @@ -0,0 +1,8 @@ +"1 Section One" 1 +"2 Section 1 + 1 = 2" 1 + "2.1 Subsection Two.One" 1 +"e ln(3)" 2 + "3.1 Subsection e ln(3) .1, with looo- ooooooooong title" 2 + "3.2 S ubsection Three.Two, another long title" 3 + "3.3 Subsection Three.Three" 3 +"4 The x → ∞ End" 3 diff --git a/spec/files/hardmode_recipe.toml b/spec/files/hardmode_recipe.toml new file mode 100644 index 0000000000000000000000000000000000000000..4f67312a7be3dd4dcb5de533a0f413b8b2536e7d --- /dev/null +++ b/spec/files/hardmode_recipe.toml @@ -0,0 +1,18 @@ +[[heading]] +level = 1 +greedy = true +font.name = "TeXGyrePagellaX-Bold" +font.size = 14.346199989318848 + +[[heading]] +level = 1 +greedy = true +font.name = "CMR10" +font.size = 9.962599754333496 +font.superscript = true + 
+[[heading]] +level = 2 +greedy = true +font.name = "TeXGyrePagellaX-Bold" +font.size = 11.9552001953125 diff --git a/spec/files/hastoc.pdf b/spec/files/hastoc.pdf new file mode 100644 index 0000000000000000000000000000000000000000..2e518d118710edac73ccf3d2c56faadc0aae8e3f Binary files /dev/null and b/spec/files/hastoc.pdf differ diff --git a/spec/files/hastoc.tex b/spec/files/hastoc.tex new file mode 100644 index 0000000000000000000000000000000000000000..1b1f62a47cd02343eeee77ca0549aed8fd6ff789 --- /dev/null +++ b/spec/files/hastoc.tex @@ -0,0 +1,42 @@ +\documentclass{article} + +\usepackage{lipsum} +\usepackage{hyperref} + +\title{2 Level Heading Test} +\author{krasjet} +\date{} + +\begin{document} +\maketitle + +\section{Section One} + +\lipsum[2-4] + +\section{Section Two} + +\lipsum[2-5] + +\subsection{Subsection Two.One} +\lipsum[2-5] + +\section{Section Three, with looong loooong looong title} + +\lipsum[1-2] + +\subsection{Subsection Three.One, with even loooooooooooonger title, and +probably even more} +\lipsum[2-5] + +\subsection{Subsection Three.Two} +\lipsum[1-1] + +\subsection{Subsection Three.Three} +\lipsum[2-3] + +\section{The End} + +\lipsum[2-5] + +\end{document} diff --git a/spec/files/hastoc.toc b/spec/files/hastoc.toc new file mode 100644 index 0000000000000000000000000000000000000000..b1916bbcb3700c1205acbe5e999b4b59b8b4a300 --- /dev/null +++ b/spec/files/hastoc.toc @@ -0,0 +1,8 @@ +"Section One" 1 +"Section Two" 1 + "Subsection Two.One" 2 +"Section Three, with looong loooong looong title" 3 + "Subsection Three.One, with even loooooooooooonger title, and probably even more" 3 + "Subsection Three.Two" 4 + "Subsection Three.Three" 5 +"The End" 5 diff --git a/spec/files/hastoc_v.toc b/spec/files/hastoc_v.toc new file mode 100644 index 0000000000000000000000000000000000000000..5b82fb6f6cd404540ce48102ae32336e63d78e34 --- /dev/null +++ b/spec/files/hastoc_v.toc @@ -0,0 +1,8 @@ +"Section One" 1 234.65998 +"Section Two" 1 562.148 + "Subsection 
Two.One" 2 449.522 +"Section Three, with looong loooong looong title" 3 330.333 + "Subsection Three.One, with even loooooooooooonger title, and probably even more" 3 616.444 + "Subsection Three.Two" 4 509.298 + "Subsection Three.Three" 5 124.802 +"The End" 5 361.387 diff --git a/spec/files/level2.pdf b/spec/files/level2.pdf new file mode 100644 index 0000000000000000000000000000000000000000..1abfde38371e99a859bc831c240170470fade538 Binary files /dev/null and b/spec/files/level2.pdf differ diff --git a/spec/files/level2.tex b/spec/files/level2.tex new file mode 100644 index 0000000000000000000000000000000000000000..9457c2ab681ecff3e0b9ba4b18fd2341819108e3 --- /dev/null +++ b/spec/files/level2.tex @@ -0,0 +1,41 @@ +\documentclass{article} + +\usepackage{lipsum} + +\title{2 Level Heading Test} +\author{krasjet} +\date{} + +\begin{document} +\maketitle + +\section{Section One} + +\lipsum[2-4] + +\section{Section Two} + +\lipsum[2-5] + +\subsection{Subsection Two.One} +\lipsum[2-5] + +\section{Section Three, with looong loooong looong title} + +\lipsum[1-2] + +\subsection{Subsection Three.One, with even loooooooooooonger title, and +probably even more} +\lipsum[2-5] + +\subsection{Subsection Three.Two} +\lipsum[1-1] + +\subsection{Subsection Three.Three} +\lipsum[2-3] + +\section{The End} + +\lipsum[2-5] + +\end{document} diff --git a/spec/files/level2.toc b/spec/files/level2.toc new file mode 100644 index 0000000000000000000000000000000000000000..ba9e8df8f2a7a3c97d7386b2ac7ed359eb4ca8e7 --- /dev/null +++ b/spec/files/level2.toc @@ -0,0 +1,8 @@ +"1 Section One" 1 +"2 Section Two" 1 + "2.1 Subsection Two.One" 2 +"3 Section Three, with looong loooong looong ti- tle" 3 + "3.1 Subsection Three.One, with even loooooooooooonger title, and probably even more" 3 + "3.2 Subsection Three.Two" 4 + "3.3 Subsection Three.Three" 5 +"4 The End" 5 diff --git a/spec/files/level2_h.toc b/spec/files/level2_h.toc new file mode 100644 index 
0000000000000000000000000000000000000000..cd65d7d2cdd2fb36a70a633ec42632d5fb0402a0 --- /dev/null +++ b/spec/files/level2_h.toc @@ -0,0 +1,8 @@ +1 Section One ··· 1 +2 Section Two ··· 1 + 2.1 Subsection Two.One ··· 2 +3 Section Three, with looong loooong looong ti- tle ··· 3 + 3.1 Subsection Three.One, with even loooooooooooonger title, and probably even more ··· 3 + 3.2 Subsection Three.Two ··· 4 + 3.3 Subsection Three.Three ··· 5 +4 The End ··· 5 diff --git a/spec/files/level2_meta b/spec/files/level2_meta new file mode 100644 index 0000000000000000000000000000000000000000..488e41923de2458207a2fedcb56a1092498d9149 --- /dev/null +++ b/spec/files/level2_meta @@ -0,0 +1,26 @@ +Section One: + font.name = "CMBX12" + font.size = 14.346199989318848 + font.color = 0x000000 + font.superscript = false + font.italic = false + font.serif = true + font.monospace = false + font.bold = true + bbox.left = 157.98439025878906 + bbox.top = 237.6484375 + bbox.right = 243.12905883789062 + bbox.bottom = 252.00897216796875 +Section Two: + font.name = "CMBX12" + font.size = 14.346199989318848 + font.color = 0x000000 + font.superscript = false + font.italic = false + font.serif = true + font.monospace = false + font.bold = true + bbox.left = 157.98439025878906 + bbox.top = 567.3842163085938 + bbox.right = 245.18057250976562 + bbox.bottom = 581.7447509765625 diff --git a/spec/files/level2_meta.toml b/spec/files/level2_meta.toml new file mode 100644 index 0000000000000000000000000000000000000000..e7886a91f71667fecbdba97f62f5a5bfc605d710 --- /dev/null +++ b/spec/files/level2_meta.toml @@ -0,0 +1,38 @@ +[[heading]] +# Section One +level = 1 +greedy = true +font.name = "CMBX12" +font.size = 14.346199989318848 +# font.size_tolerance = 1e-5 +# font.color = 0x000000 +# font.superscript = false +# font.italic = false +# font.serif = true +# font.monospace = false +# font.bold = true +# bbox.left = 157.98439025878906 +# bbox.top = 237.6484375 +# bbox.right = 243.12905883789062 +# bbox.bottom = 
252.00897216796875 +# bbox.tolerance = 1e-5 + +[[heading]] +# Section Two +level = 1 +greedy = true +font.name = "CMBX12" +font.size = 14.346199989318848 +# font.size_tolerance = 1e-5 +# font.color = 0x000000 +# font.superscript = false +# font.italic = false +# font.serif = true +# font.monospace = false +# font.bold = true +# bbox.left = 157.98439025878906 +# bbox.top = 567.3842163085938 +# bbox.right = 245.18057250976562 +# bbox.bottom = 581.7447509765625 +# bbox.tolerance = 1e-5 + diff --git a/spec/files/level2_recipe.toml b/spec/files/level2_recipe.toml new file mode 100644 index 0000000000000000000000000000000000000000..9a69ff7864e4bc658fd44bff50f01735337ff17a --- /dev/null +++ b/spec/files/level2_recipe.toml @@ -0,0 +1,9 @@ +[[heading]] +level = 1 +font.name = "CMBX12" +font.size = 14.346199989318848 + +[[heading]] +level = 2 +font.name = "CMBX12" +font.size = 11.9552001953125 diff --git a/spec/files/onepage.pdf b/spec/files/onepage.pdf new file mode 100644 index 0000000000000000000000000000000000000000..4e4bfb3d1a90cf9d35cee459af93cb5677307eb5 Binary files /dev/null and b/spec/files/onepage.pdf differ diff --git a/spec/files/onepage.tex b/spec/files/onepage.tex new file mode 100644 index 0000000000000000000000000000000000000000..224b3e62a53af1ea1f4148c88e96a15dfd3d2b5b --- /dev/null +++ b/spec/files/onepage.tex @@ -0,0 +1,37 @@ +\documentclass{article} + +\usepackage{lipsum} + +\title{One page Test} +\author{krasjet} +\date{} + +\begin{document} +\maketitle + +\section{Section One} + +\section{Section Two} + +\subsection{Subsection Two.One} +\subsection{Subsection Two.Two $\times 2$} + +\section{Section Three, with looong loooong looong title} + +\subsection{Subsection Three.One, with even loooooooooooonger title, and +probably even more} + +\subsection{Subsection Three.Two} + +\subsection{Subsection Three.Three} +\subsubsection{Subsubsection Three.Three.One} +\subsubsection{Subsubsection Three.Three.Two} +\subsubsection{Subsubsection Three.Three.Three} + 
+\subsection{Subsection Three.Four} + +\subsection{Subsection Three.Five} + +\section{The End} + +\end{document} diff --git a/spec/files/onepage.toc b/spec/files/onepage.toc new file mode 100644 index 0000000000000000000000000000000000000000..cab90366a9af8cc9ee4368806f84ead29dd76b19 --- /dev/null +++ b/spec/files/onepage.toc @@ -0,0 +1,14 @@ +"1 Section One" 1 +"2 Section Two" 1 + "2.1 Subsection Two.One" 1 + "2.2 Subsection Two.Two × 2" 1 +"3 Section Three, with looong loooong looong ti- tle" 1 + "3.1 Subsection Three.One, with even loooooooooooonger title, and probably even more" 1 + "3.2 Subsection Three.Two" 1 + "3.3 Subsection Three.Three" 1 + "3.3.1 Subsubsection Three.Three.One" 1 + "3.3.2 Subsubsection Three.Three.Two" 1 + "3.3.3 Subsubsection Three.Three.Three" 1 + "3.4 Subsection Three.Four" 1 + "3.5 Subsection Three.Five" 1 +"4 The End" 1 diff --git a/spec/files/onepage_greedy.toml b/spec/files/onepage_greedy.toml new file mode 100644 index 0000000000000000000000000000000000000000..ff645f2f8ab30042c52e9f90c5b1e9189ff5c2d3 --- /dev/null +++ b/spec/files/onepage_greedy.toml @@ -0,0 +1,15 @@ +[[heading]] +level = 1 +font.name = "CMBX12" +font.size = 14.346199989318848 + +[[heading]] +level = 2 +greedy = true +font.name = "CMBX12" +font.size = 11.9552001953125 + +[[heading]] +level = 3 +font.name = "CMBX10" +font.size = 9.962599754333496 diff --git a/spec/files/onepage_recipe.toml b/spec/files/onepage_recipe.toml new file mode 100644 index 0000000000000000000000000000000000000000..1602f87bf67e0667012e11fe6f49e15f20e1ae2e --- /dev/null +++ b/spec/files/onepage_recipe.toml @@ -0,0 +1,14 @@ +[[heading]] +level = 1 +font.name = "CMBX12" +font.size = 14.346199989318848 + +[[heading]] +level = 2 +font.name = "(CMBX12|CMSY10|CMR12)" +font.size = 11.9552001953125 + +[[heading]] +level = 3 +font.name = "CMBX10" +font.size = 9.962599754333496 diff --git a/spec/files/recipe_spec.toml b/spec/files/recipe_spec.toml new file mode 100644 index 
0000000000000000000000000000000000000000..fc9bbe76e2bbd0ac0f9780ed5f0c79011f213c1e --- /dev/null +++ b/spec/files/recipe_spec.toml @@ -0,0 +1,33 @@ +[[heading]] +level = 1 +font.name = "CMBX12" +font.size = 14.346199989318848 +font.size_tolerance = 1e-5 +font.color = 0x000000 +font.superscript = false +font.italic = false +font.serif = true +font.monospace = false +font.bold = true +bbox.left = 157.98439025878906 +bbox.top = 335.569580078125 +bbox.right = 477.66058349609375 +bbox.bottom = 349.93011474609375 +bbox.tolerance = 1e-5 + +[[heading]] +level = 2 +font.name = "CMBX10" +font.size = 9.962599754333496 +font.size_tolerance = 1e-5 +font.color = 0x000000 +font.superscript = false +font.italic = false +font.serif = true +font.monospace = false +font.bold = true +bbox.left = 168.76663208007812 +bbox.top = 127.2930679321289 +bbox.right = 280.66656494140625 +bbox.bottom = 137.2556610107422 +bbox.tolerance = 1e-5 diff --git a/spec/filter_spec.py b/spec/filter_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..ac02003c36bccd9b573e487f0915fb7383017df4 --- /dev/null +++ b/spec/filter_spec.py @@ -0,0 +1,642 @@ +import os + +from mamba import description, it, before +from pdftocgen.filter import ( + ToCFilter, + admits_float, + FontFilter, + BoundingBoxFilter +) + +dirpath = os.path.dirname(os.path.abspath(__file__)) + +with description("admits_float") as self: + with it("admits if difference is below tol"): + assert admits_float(1, 1.05, 0.1) + assert admits_float(1, 0.95, 0.1) + + with it("does not admit if difference is too large"): + assert not admits_float(1, 1.5, 0.1) + assert not admits_float(1, 0.5, 0.1) + + with it("admits anything if expect is unset"): + assert admits_float(None, 1, 0.1) + assert admits_float(None, None, 0.1) + + with it("does not admit if expect is set but actual is None"): + assert not admits_float(1, None, 0.1) + +with description("ToCFilter") as self: + with before.all: + self.title_exact = { + 'level': 1, + 'font': 
{ + 'name': "CMBX12", + 'size': 14.346199989318848, + 'size_tolerance': 0, + 'color': 0, + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': True + }, + 'bbox': { + 'left': 157.98439025878906, + 'top': 567.3842163085938, + 'right': 245.18057250976562, + 'bottom': 581.7447509765625, + 'tolerance': 0 + } + } + + self.text_exact = { + 'level': 2, + 'font': { + 'name': "CMR10", + 'size': 9.962599754333496, + 'size_tolerance': 0, + 'color': 0, + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': False + }, + 'bbox': { + 'left': 133.76800537109375, + 'top': 592.492919921875, + 'right': 477.537353515625, + 'bottom': 602.4555053710938, + 'tolerance': 0 + } + } + + self.spn_title = { + 'size': 14.346199989318848, + 'flags': 20, + 'font': 'TZOLRB+CMBX12', + 'color': 0, + 'text': 'Section Two', + 'bbox': (157.98439025878906, + 567.3842163085938, + 245.18057250976562, + 581.7447509765625) + } + + self.spn_text = { + 'size': 9.962599754333496, + 'flags': 4, + 'font': 'MJDLZY+CMR10', + 'color': 0, + 'text': 'text', + 'bbox': (133.76800537109375, + 592.492919921875, + 477.537353515625, + 602.4555053710938) + } + + with it("raises error if no toc level is specified"): + try: + fltr = ToCFilter({}) + except ValueError: + pass + except: + assert False, "must raise error" + + with it("raises error if toc level is invalid"): + try: + fltr = ToCFilter({'level': 0}) + fltr = ToCFilter({'level': -1}) + except ValueError: + pass + except: + assert False, "must raise error" + + with it("does not raise error if toc level is valid"): + try: + fltr = ToCFilter({'level': 1}) + fltr = ToCFilter({'level': 2}) + except ValueError: + assert False, "must not raise error" + + with it("admits exact matches"): + filter_title = ToCFilter(self.title_exact) + filter_text = ToCFilter(self.text_exact) + assert filter_title.admits(self.spn_title) + assert filter_text.admits(self.spn_text) + + with it("rejects unmatched spans"): + 
filter_title = ToCFilter(self.title_exact) + filter_text = ToCFilter(self.text_exact) + assert not filter_title.admits(self.spn_text) + assert not filter_text.admits(self.spn_title) + + with it("admits correctly without bbox"): + filter_title = ToCFilter({ + 'level': 1, + 'font': { + 'name': "CMBX12", + } + }) + assert filter_title.admits(self.spn_title) + + filter_text = ToCFilter({ + 'level': 2, + 'font': { + 'size': 9.962599754333496, + } + }) + assert filter_text.admits(self.spn_text) + + with it("rejects correctly without bbox"): + filter_title = ToCFilter({ + 'level': 1, + 'font': { + 'name': "CMBX12", + } + }) + assert not filter_title.admits(self.spn_text) + + filter_text = ToCFilter({ + 'level': 2, + 'font': { + 'size': 9.962599754333496, + } + }) + assert not filter_text.admits(self.spn_title) + + with it("admits correctly without font"): + filter_title = ToCFilter({ + 'level': 1, + 'bbox': { + 'left': 157.98439025878906, + } + }) + assert filter_title.admits(self.spn_title) + + filter_text = ToCFilter({ + 'level': 2, + 'bbox': { + 'top': 592.492919921875, + } + }) + assert filter_text.admits(self.spn_text) + + with it("rejects correctly without font"): + filter_title = ToCFilter({ + 'level': 1, + 'bbox': { + 'left': 157.98439025878906, + } + }) + assert not filter_title.admits(self.spn_text) + + filter_text = ToCFilter({ + 'level': 2, + 'bbox': { + 'top': 592.492919921875, + } + }) + assert not filter_text.admits(self.spn_title) + + +with description("FontFilter") as self: + with before.all: + self.title_exact = { + 'name': "CMBX12", + 'size': 14.346199989318848, + 'size_tolerance': 0, + 'color': 0, + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': True + } + + self.text_exact = { + 'name': "CMR10", + 'size': 9.962599754333496, + 'size_tolerance': 0, + 'color': 0, + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': False + } + + self.spn_title = { + 'size': 
14.346199989318848, + 'flags': 20, + 'font': 'TZOLRB+CMBX12', + 'color': 0, + 'text': 'Section Two', + 'bbox': (157.98439025878906, + 567.3842163085938, + 245.18057250976562, + 581.7447509765625) + } + + self.spn_small_title = { + 'size': 9.962599754333496, + 'flags': 4, + 'font': 'TZOLRB+CMBX12', + 'color': 0, + 'text': 'text', + 'bbox': (133.76800537109375, + 592.492919921875, + 477.537353515625, + 602.4555053710938) + } + + self.spn_text = { + 'size': 9.962599754333496, + 'flags': 4, + 'font': 'MJDLZY+CMR10', + 'color': 0, + 'text': 'text', + 'bbox': (133.76800537109375, + 592.492919921875, + 477.537353515625, + 602.4555053710938) + } + + with it("has a working constructor"): + fnt = FontFilter(self.title_exact) + assert fnt.name.search("TZOLRB+CMBX12") + assert fnt.name.search("CMBX12") + assert not fnt.name.search("CMBX10") + assert fnt.flags == 0b10100 + assert fnt.ign_mask == 0b11111 + assert fnt.color == 0x000000 + assert fnt.size == 14.346199989318848 + assert fnt.size_tolerance == 0 + + with it("can construct if empty dict is given in the constructor"): + fnt = FontFilter({}) + assert fnt.name.search("anything") + assert fnt.flags == 0 + assert fnt.ign_mask == 0 + assert fnt.color is None + assert fnt.size is None + assert fnt.size_tolerance == 1e-5 + + with it("admits exact matches"): + fnt_title = FontFilter(self.title_exact) + fnt_text = FontFilter(self.text_exact) + assert fnt_title.admits(self.spn_title) + assert fnt_text.admits(self.spn_text) + + with it("rejects unmatched spans"): + fnt_title = FontFilter(self.title_exact) + assert not fnt_title.admits(self.spn_text) + assert not fnt_title.admits(self.spn_small_title) + + fnt_text = FontFilter(self.text_exact) + assert not fnt_text.admits(self.spn_title) + assert not fnt_text.admits(self.spn_small_title) + + with it("admits correctly without font name"): + fnt_title = FontFilter({ + 'size': 14.346199989318848, + 'size_tolerance': 0, + 'color': 0, + 'superscript': False, + 'italic': False, + 
'serif': True, + 'monospace': False, + 'bold': True + }) + assert fnt_title.admits(self.spn_title) + + with it("rejects correctly without font name"): + fnt_title = FontFilter({ + 'size': 14.346199989318848, + 'size_tolerance': 0, + 'color': 0, + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': True + }) + assert not fnt_title.admits(self.spn_text) + assert not fnt_title.admits(self.spn_small_title) + + with it("admits correctly with only font name"): + fnt_title = FontFilter({ + 'name': "CMBX12" + }) + assert fnt_title.admits(self.spn_title) + assert fnt_title.admits(self.spn_small_title) + + with it("rejects correctly with only font name"): + fnt_title = FontFilter({ + 'name': "CMBX12" + }) + assert not fnt_title.admits(self.spn_text) + + with it("admits correctly without size"): + fnt_title = FontFilter({ + 'name': "CMBX12", + 'size_tolerance': 0, + 'color': 0, + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': True + }) + assert fnt_title.admits(self.spn_title) + + with it("rejects correctly without size"): + fnt_title = FontFilter({ + 'name': "CMBX12", + 'size_tolerance': 0, + 'color': 0, + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': True + }) + assert not fnt_title.admits(self.spn_text) + assert not fnt_title.admits(self.spn_small_title) + + with it("admits correctly with only size"): + fnt_title = FontFilter({ + 'size': 14.346199989318848, + 'size_tolerance': 0 + }) + assert fnt_title.admits(self.spn_title) + + with it("rejects correctly with only size"): + fnt_title = FontFilter({ + 'size': 14.346199989318848, + 'size_tolerance': 0 + }) + assert not fnt_title.admits(self.spn_text) + assert not fnt_title.admits(self.spn_small_title) + + with it("admits correctly without color"): + fnt_title = FontFilter({ + 'name': "CMBX12", + 'size': 14.346199989318848, + 'size_tolerance': 0, + 'superscript': False, + 'italic': False, + 'serif': 
True, + 'monospace': False, + 'bold': True + }) + assert fnt_title.admits(self.spn_title) + + with it("rejects correctly without color"): + fnt_title = FontFilter({ + 'name': "CMBX12", + 'size': 14.346199989318848, + 'size_tolerance': 0, + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': True + }) + assert not fnt_title.admits(self.spn_text) + assert not fnt_title.admits(self.spn_small_title) + + with it("admits correctly with only color"): + fnt_title = FontFilter({ + 'color': 0x000000, + }) + assert fnt_title.admits(self.spn_title) + assert fnt_title.admits(self.spn_text) + assert fnt_title.admits(self.spn_small_title) + + with it("rejects correctly with only color"): + fnt_title = FontFilter({ + 'color': 0x000000, + }) + spn_blue = { + 'size': 14.346199989318848, + 'flags': 20, + 'font': 'TZOLRB+CMBX12', + 'color': 0x0000ff, + 'text': 'Section Two', + 'bbox': (157.98439025878906, + 567.3842163085938, + 245.18057250976562, + 581.7447509765625) + } + assert not fnt_title.admits(spn_blue) + + with it("admits correctly with only flags"): + fnt_title = FontFilter({ + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': True + }) + assert fnt_title.admits(self.spn_title) + + with it("rejects correctly with only flags"): + fnt_title = FontFilter({ + 'superscript': False, + 'italic': False, + 'serif': True, + 'monospace': False, + 'bold': True + }) + assert not fnt_title.admits(self.spn_text) + assert not fnt_title.admits(self.spn_small_title) + + with it("admits correctly without flags"): + fnt_title = FontFilter({ + 'name': "CMBX12", + 'size': 14.346199989318848, + 'size_tolerance': 0, + 'color': 0, + }) + assert fnt_title.admits(self.spn_title) + + with it("rejects correctly without flags"): + fnt_title = FontFilter({ + 'name': "CMBX12", + 'size': 14.346199989318848, + 'size_tolerance': 0, + 'color': 0, + }) + assert not fnt_title.admits(self.spn_text) + assert not 
fnt_title.admits(self.spn_small_title) + + with it("admits correctly with partial flags"): + fnt_title = FontFilter({ + 'serif': True, + 'bold': True + }) + fnt_serif = FontFilter({ + 'serif': True + }) + fnt_sans = FontFilter({ + 'serif': False + }) + fnt_mono = FontFilter({ + 'monospace': True + }) + assert fnt_title.admits(self.spn_title) + assert fnt_serif.admits(self.spn_title) + assert fnt_serif.admits(self.spn_text) + assert fnt_sans.admits({'flags': 0b11011}) + assert fnt_mono.admits({'flags': 0b11111}) + + with it("rejects correctly with partial flags"): + fnt_title = FontFilter({ + 'serif': True, + 'bold': True + }) + fnt_serif = FontFilter({ + 'serif': True + }) + fnt_sans = FontFilter({ + 'serif': False + }) + fnt_mono = FontFilter({ + 'monospace': True + }) + assert not fnt_title.admits(self.spn_text) + assert not fnt_title.admits(self.spn_small_title) + assert not fnt_sans.admits(self.spn_title) + assert not fnt_sans.admits(self.spn_text) + assert not fnt_mono.admits(self.spn_title) + assert not fnt_mono.admits(self.spn_text) + + +with description("BoundingBoxFilter") as self: + with before.all: + self.title_exact = { + 'left': 157.98439025878906, + 'top': 567.3842163085938, + 'right': 245.18057250976562, + 'bottom': 581.7447509765625, + 'tolerance': 0 + } + + self.text_exact = { + 'left': 133.76800537109375, + 'top': 592.492919921875, + 'right': 477.537353515625, + 'bottom': 602.4555053710938, + 'tolerance': 0 + } + + self.spn_title = { + 'size': 14.346199989318848, + 'flags': 20, + 'font': 'TZOLRB+CMBX12', + 'color': 0, + 'text': 'Section Two', + 'bbox': (157.98439025878906, + 567.3842163085938, + 245.18057250976562, + 581.7447509765625) + } + + self.spn_title2 = { + 'size': 14.346199989318848, + 'flags': 20, + 'font': 'TZOLRB+CMBX12', + 'color': 0, + 'text': 'Section One', + 'bbox': (157.98439025878906, + 335.569580078125, + 477.66058349609375, + 349.93011474609375) + } + + self.spn_text = { + 'size': 9.962599754333496, + 'flags': 4, + 'font': 
'MJDLZY+CMR10', + 'color': 0, + 'text': 'text', + 'bbox': (133.76800537109375, + 592.492919921875, + 477.537353515625, + 602.4555053710938) + } + with it("has a working constructor"): + bbox = BoundingBoxFilter(self.title_exact) + assert bbox.left is not None + assert bbox.right is not None + assert bbox.top is not None + assert bbox.bottom is not None + assert bbox.tolerance == 0 + + with it("can construct if empty dict is given in the constructor"): + bbox = BoundingBoxFilter({}) + assert bbox.left is None + assert bbox.right is None + assert bbox.top is None + assert bbox.bottom is None + assert bbox.tolerance == 1e-5 + + with it("admits exact matches"): + bbox_title = BoundingBoxFilter(self.title_exact) + bbox_text = BoundingBoxFilter(self.text_exact) + assert bbox_title.admits(self.spn_title) + assert bbox_text.admits(self.spn_text) + + with it("rejects unmatched spans"): + bbox_title = BoundingBoxFilter(self.title_exact) + assert not bbox_title.admits(self.spn_text) + assert not bbox_title.admits(self.spn_title2) + + bbox_text = BoundingBoxFilter(self.text_exact) + assert not bbox_text.admits(self.spn_title) + assert not bbox_text.admits(self.spn_title2) + + with it("admits correctly with partial bbox"): + bbox_title = BoundingBoxFilter({ + 'left': 157.98439025878906 + }) + assert bbox_title.admits(self.spn_title) + assert bbox_title.admits(self.spn_title2) + + bbox_top = BoundingBoxFilter({ + 'top': 567.3842163085938 + }) + assert bbox_top.admits(self.spn_title) + + bbox_right = BoundingBoxFilter({ + 'right': 245.18057250976562 + }) + assert bbox_right.admits(self.spn_title) + + bbox_bottom = BoundingBoxFilter({ + 'bottom': 581.7447509765625 + }) + assert bbox_bottom.admits(self.spn_title) + + with it("rejects correctly with partial bbox"): + bbox_title = BoundingBoxFilter({ + 'left': 157.98439025878906 + }) + assert not bbox_title.admits(self.spn_text) + + bbox_top = BoundingBoxFilter({ + 'top': 567.3842163085938 + }) + assert not 
bbox_top.admits(self.spn_title2) + + bbox_right = BoundingBoxFilter({ + 'right': 245.18057250976562 + }) + assert not bbox_right.admits(self.spn_title2) + + bbox_bottom = BoundingBoxFilter({ + 'bottom': 581.7447509765625 + }) + assert not bbox_bottom.admits(self.spn_title2) diff --git a/spec/fitzutils_spec.py b/spec/fitzutils_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..2fb271c74bacbb6cc3d062cebcdafd3fe6671d28 --- /dev/null +++ b/spec/fitzutils_spec.py @@ -0,0 +1,101 @@ +import os +import io + +from mamba import description, it, before +from fitzutils import ( + open_pdf, + ToCEntry, + dump_toc +) +from pdftocio.tocparser import parse_toc + +dirpath = os.path.dirname(os.path.abspath(__file__)) + +valid_file = os.path.join(dirpath, "files/level2.pdf") +invalid_file = os.path.join(dirpath, "files/nothing.pdf") + +with description("open_pdf:") as self: + with it("opens pdf file for reading"): + with open_pdf(valid_file, False) as doc: + assert doc is not None + assert doc.page_count == 6 + + with it("returns None if pdf file is invalid"): + with open_pdf(invalid_file, False) as doc: + assert doc is None + + with it("exits if pdf file is invalid and exit_on_error is true"): + try: + with open_pdf(invalid_file, True) as doc: + assert False, "should have exited" + except AssertionError as err: + raise err + except: + pass + +with description("ToCEntry") as self: + with it("matches fitz's representation"): + fitz_entry = [1, "title", 2] + fitz_entry2 = [1, "title", 2, 100.0] + + toc_entry = ToCEntry(level=1, title="title", pagenum=2) + toc_entry2 = ToCEntry(level=1, title="title", pagenum=2, vpos=100.0) + + assert toc_entry.to_fitz_entry() == fitz_entry + assert toc_entry2.to_fitz_entry() == fitz_entry2 + + assert ToCEntry(*fitz_entry) == toc_entry + assert ToCEntry(*fitz_entry2) == toc_entry2 + + with it("is sorted correctly"): + entries = [ + ToCEntry(level=1, title="title4", pagenum=2, vpos=150.0), + ToCEntry(level=1, title="title3", 
pagenum=2, vpos=90.0), + ToCEntry(level=1, title="title5", pagenum=3, vpos=0.0), + ToCEntry(level=1, title="title2", pagenum=1, vpos=150.0), + ToCEntry(level=1, title="title1", pagenum=1, vpos=100.0), + ToCEntry(level=1, title="title6", pagenum=5, vpos=200.0) + ] + + expected = [ + ToCEntry(level=1, title="title1", pagenum=1, vpos=100.0), + ToCEntry(level=1, title="title2", pagenum=1, vpos=150.0), + ToCEntry(level=1, title="title3", pagenum=2, vpos=90.0), + ToCEntry(level=1, title="title4", pagenum=2, vpos=150.0), + ToCEntry(level=1, title="title5", pagenum=3, vpos=0.0), + ToCEntry(level=1, title="title6", pagenum=5, vpos=200.0) + ] + assert sorted(entries, key=ToCEntry.key) == expected + + +with description("dump_toc") as self: + with before.all: + self.toc = [ + ToCEntry(level=1, title="title1", pagenum=1, vpos=100.0), + ToCEntry(level=2, title="title2", pagenum=1, vpos=150.0), + ToCEntry(level=3, title="title3", pagenum=2, vpos=90.0), + ToCEntry(level=2, title="title4", pagenum=2, vpos=150.0), + ToCEntry(level=2, title="title5", pagenum=3, vpos=0.0), + ToCEntry(level=1, title="title6", pagenum=5, vpos=200.0) + ] + + self.toc_novpos = [ + ToCEntry(level=1, title="title1", pagenum=1), + ToCEntry(level=2, title="title2", pagenum=1), + ToCEntry(level=3, title="title3", pagenum=2), + ToCEntry(level=2, title="title4", pagenum=2), + ToCEntry(level=2, title="title5", pagenum=3), + ToCEntry(level=1, title="title6", pagenum=5) + ] + + with it("won't print vpos if vpos is False"): + toc_s = dump_toc(self.toc, False) + f = io.StringIO(toc_s) + assert parse_toc(f) == self.toc_novpos + assert parse_toc(f) != self.toc + + with it("won't print vpos if vpos is missing"): + toc_s = dump_toc(self.toc_novpos, True) + f = io.StringIO(toc_s) + assert parse_toc(f) == self.toc_novpos + assert parse_toc(f) != self.toc diff --git a/spec/parser_spec.py b/spec/parser_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..548998adef037079f0b0a3044725985d00482be7 --- 
/dev/null +++ b/spec/parser_spec.py @@ -0,0 +1,65 @@ +import os +import io + +from mamba import description, it, before +from fitzutils import ( + dump_toc, + ToCEntry +) +from pdftocio.tocparser import parse_toc + +dirpath = os.path.dirname(os.path.abspath(__file__)) + +valid_file = os.path.join(dirpath, "files/level2.pdf") +invalid_file = os.path.join(dirpath, "files/nothing.pdf") + +with description("parse_toc") as self: + with before.all: + self.toc = [ + ToCEntry(level=1, title="title1", pagenum=1, vpos=100.0), + ToCEntry(level=2, title="title2", pagenum=1, vpos=150.0), + ToCEntry(level=3, title="title3", pagenum=2, vpos=90.0), + ToCEntry(level=2, title="title4", pagenum=2, vpos=150.0), + ToCEntry(level=2, title="title5", pagenum=3, vpos=0.0), + ToCEntry(level=1, title="title6", pagenum=5, vpos=200.0) + ] + + self.toc_novpos = [ + ToCEntry(level=1, title="title1", pagenum=1), + ToCEntry(level=2, title="title2", pagenum=1), + ToCEntry(level=3, title="title3", pagenum=2), + ToCEntry(level=2, title="title4", pagenum=2), + ToCEntry(level=2, title="title5", pagenum=3), + ToCEntry(level=1, title="title6", pagenum=5) + ] + + + with it("can recover the result from dump_toc"): + toc_s = dump_toc(self.toc, True) + f = io.StringIO(toc_s) + assert parse_toc(f) == self.toc + assert parse_toc(f) != self.toc_novpos + + toc_s = dump_toc(self.toc_novpos, False) + f = io.StringIO(toc_s) + assert parse_toc(f) == self.toc_novpos + assert parse_toc(f) != self.toc + + with it("escapes quotations correctly"): + quoted = '"a ""quoted"" title" 2\n "a single \'quoted\' title" 4' + expect = [ + ToCEntry(level=1, title='a "quoted" title', pagenum=2), + ToCEntry(level=2, title="a single 'quoted' title", pagenum=4) + ] + f = io.StringIO(quoted) + assert parse_toc(f) == expect + + with it("raises error when toc entry is invalid"): + malformed = '"entry" 1\n "error entry"' + f = io.StringIO(malformed) + try: + parse_toc(f) + except IndexError: + pass + else: + assert False, "must raise 
error" diff --git a/spec/tocgen_spec.py b/spec/tocgen_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..aade8b67f47d66d0bf3913e473adca9a1702ac8c --- /dev/null +++ b/spec/tocgen_spec.py @@ -0,0 +1,159 @@ +import os +import fitz +import toml + +from mamba import description, it, before +from fitzutils import ToCEntry +from pdftocgen.tocgen import gen_toc + +dirpath = os.path.dirname(os.path.abspath(__file__)) + +with description("gen_toc") as self: + with before.all: + self.level2 = fitz.open(os.path.join(dirpath, "files/level2.pdf")) + self.level2_recipe = toml.load( + open(os.path.join(dirpath, "files/level2_recipe.toml")) + ) + self.level2_expect = [ + ToCEntry(level=1, title='1 Section One', + pagenum=1, vpos=237.6484375), + ToCEntry(level=1, title='2 Section Two', + pagenum=1, vpos=567.3842163085938), + ToCEntry(level=2, title='2.1 Subsection Two.One', + pagenum=2, vpos=452.56671142578125), + ToCEntry(level=1, + title='3 Section Three, with looong loooong looong ti- tle', + pagenum=3, vpos=335.569580078125), + ToCEntry(level=2, title='3.1 Subsection Three.One, ' + 'with even loooooooooooonger title, and probably even more', + pagenum=3, vpos=619.4886474609375), + ToCEntry(level=2, title='3.2 Subsection Three.Two', + pagenum=4, vpos=512.3426513671875), + ToCEntry(level=2, title='3.3 Subsection Three.Three', + pagenum=5, vpos=125.79861450195312), + ToCEntry(level=1, title='4 The End', + pagenum=5, vpos=366.62347412109375) + ] + + self.onepage = fitz.open(os.path.join(dirpath, "files/onepage.pdf")) + self.onepage_recipe = toml.load( + open(os.path.join(dirpath, "files/onepage_recipe.toml")) + ) + self.onepage_greedy = toml.load( + open(os.path.join(dirpath, "files/onepage_greedy.toml")) + ) + self.onepage_expect = [ + # false positive, but easy to remove in post-processing + ToCEntry(level=2, title='krasjet', + pagenum=1, vpos=196.53366088867188), + ToCEntry(level=1, title='1 Section One', + pagenum=1, vpos=237.6484375), + 
ToCEntry(level=1, title='2 Section Two', + pagenum=1, vpos=265.44744873046875), + ToCEntry(level=2, title='2.1 Subsection Two.One', + pagenum=1, vpos=291.0536804199219), + ToCEntry(level=2, title='2.2 Subsection Two.Two \xd7 2', + pagenum=1, vpos=311.1368103027344), + ToCEntry(level=1, title='3 Section Three, with looong loooong looong ti- tle', + pagenum=1, vpos=334.00946044921875), + ToCEntry(level=2, title='3.1 Subsection Three.One, ' + 'with even loooooooooooonger title, and probably even more', + pagenum=1, vpos=377.5487060546875), + ToCEntry(level=2, title='3.2 Subsection Three.Two', + pagenum=1, vpos=411.8786926269531), + ToCEntry(level=2, title='3.3 Subsection Three.Three', + pagenum=1, vpos=432.26068115234375), + ToCEntry(level=3, title='3.3.1 Subsubsection Three.Three.One', + pagenum=1, vpos=452.1441345214844), + ToCEntry(level=3, title='3.3.2 Subsubsection Three.Three.Two', + pagenum=1, vpos=470.53314208984375), + ToCEntry(level=3, title='3.3.3 Subsubsection Three.Three.Three', + pagenum=1, vpos=488.9231262207031), + ToCEntry(level=2, title='3.4 Subsection Three.Four', + pagenum=1, vpos=507.8106994628906), + ToCEntry(level=2, title='3.5 Subsection Three.Five', + pagenum=1, vpos=528.191650390625), + ToCEntry(level=1, title='4 The End', + pagenum=1, vpos=550.7654418945312) + ] + + self.onepage_greedy_expect = [ + # hooray, no more false positives + ToCEntry(level=1, title='1 Section One', + pagenum=1, vpos=237.6484375), + ToCEntry(level=1, title='2 Section Two', + pagenum=1, vpos=265.44744873046875), + ToCEntry(level=2, title='2.1 Subsection Two.One', + pagenum=1, vpos=291.0536804199219), + ToCEntry(level=2, title='2.2 Subsection Two.Two \xd7 2', + pagenum=1, vpos=311.1368103027344), + ToCEntry(level=1, title='3 Section Three, with looong loooong looong ti- tle', + pagenum=1, vpos=334.00946044921875), + ToCEntry(level=2, title='3.1 Subsection Three.One, ' + 'with even loooooooooooonger title, and probably even more', + pagenum=1, vpos=377.5487060546875), + 
ToCEntry(level=2, title='3.2 Subsection Three.Two', + pagenum=1, vpos=411.8786926269531), + ToCEntry(level=2, title='3.3 Subsection Three.Three', + pagenum=1, vpos=432.26068115234375), + ToCEntry(level=3, title='3.3.1 Subsubsection Three.Three.One', + pagenum=1, vpos=452.1441345214844), + ToCEntry(level=3, title='3.3.2 Subsubsection Three.Three.Two', + pagenum=1, vpos=470.53314208984375), + ToCEntry(level=3, title='3.3.3 Subsubsection Three.Three.Three', + pagenum=1, vpos=488.9231262207031), + ToCEntry(level=2, title='3.4 Subsection Three.Four', + pagenum=1, vpos=507.8106994628906), + ToCEntry(level=2, title='3.5 Subsection Three.Five', + pagenum=1, vpos=528.191650390625), + ToCEntry(level=1, title='4 The End', + pagenum=1, vpos=550.7654418945312) + ] + + self.hardmode = fitz.open(os.path.join(dirpath, "files/hardmode.pdf")) + self.hardmode_recipe = toml.load( + open(os.path.join(dirpath, "files/hardmode_recipe.toml")) + ) + + self.hardmode_expect = [ + ToCEntry(level=1, title='1 Section One', + pagenum=1, vpos=174.1232452392578), + ToCEntry(level=1, title='2 Section 1 + 1 = 2', + pagenum=1, vpos=584.5831909179688), + ToCEntry(level=2, title='2.1 Subsection Two.One', + pagenum=1, vpos=425.2061462402344), + ToCEntry(level=1, title='e ln(3)', + pagenum=2, vpos=516.01708984375), + ToCEntry(level=2, title='3.1 Subsection e ln(3) .1, ' + 'with looo- ooooooooong title', + pagenum=2, vpos=302.5021057128906), + ToCEntry(level=2, title='3.2 S ubsection Three.Two, another long title', + pagenum=3, vpos=396.212158203125), + ToCEntry(level=2, title='3.3 Subsection Three.Three', + pagenum=3, vpos=68.84815979003906), + ToCEntry(level=1, title='4 The x → ∞ End', + pagenum=3, vpos=483.49920654296875) + ] + + with it("generates 2-level toc correctly"): + assert gen_toc(self.level2, self.level2_recipe) == self.level2_expect + + with it("handles headings on same page correctly"): + assert gen_toc( + self.onepage, self.onepage_recipe + ) == self.onepage_expect + + with it("handles 
math in heading correctly"): + assert gen_toc( + self.onepage, self.onepage_recipe + ) == self.onepage_expect + + with it("handles greedy filter correctly"): + assert gen_toc( + self.onepage, self.onepage_greedy + ) == self.onepage_greedy_expect + + with it("passes the HARD MODE"): + assert gen_toc( + self.hardmode, self.hardmode_recipe + ) == self.hardmode_expect diff --git a/spec/tocio_spec.py b/spec/tocio_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..4b4fff163c0a3e8f49f3832ef1c2e588d57b5120 --- /dev/null +++ b/spec/tocio_spec.py @@ -0,0 +1,81 @@ +import os +import fitz + +from mamba import description, it, before +from fitzutils import ToCEntry +from pdftocio.tocio import read_toc, write_toc + +dirpath = os.path.dirname(os.path.abspath(__file__)) + +level2 = os.path.join(dirpath, "files/level2.pdf") +hastoc = os.path.join(dirpath, "files/hastoc.pdf") + +with description("read_toc") as self: + with before.all: + self.doc = fitz.open(level2) + self.reference = fitz.open(hastoc) + self.expect = [ + ToCEntry(level=1, title='Section One', pagenum=1, vpos=234.65998), + ToCEntry(level=1, title='Section Two', pagenum=1, vpos=562.148), + ToCEntry(level=2, title='Subsection Two.One', pagenum=2, vpos=449.522), + ToCEntry(level=1, + title='Section Three, with looong loooong looong title', + pagenum=3, + vpos=330.333), + ToCEntry(level=2, + title='Subsection Three.One, ' + 'with even loooooooooooonger title, and probably even more', + pagenum=3, + vpos=616.444), + ToCEntry(level=2, title='Subsection Three.Two', + pagenum=4, vpos=509.298), + ToCEntry(level=2, title='Subsection Three.Three', + pagenum=5, vpos=124.802), + ToCEntry(level=1, title='The End', pagenum=5, vpos=361.387) + ] + + with it("reads pdf toc correctly"): + assert self.expect == read_toc(self.reference) + + with it("makes (read_toc -> write_toc -> read_toc) an identity operation (except vpos)"): + toc = read_toc(self.reference) + write_toc(self.doc, toc) + toc2 = 
read_toc(self.doc) + + assert len(toc2) == len(toc) + for e1, e2 in zip(toc, toc2): + assert e1.level == e2.level + assert e1.title == e2.title + assert e1.pagenum == e2.pagenum + +with description("write_toc") as self: + with before.all: + self.doc = fitz.open(level2) + self.reference = fitz.open(hastoc) + self.toc = [ + ToCEntry(level=1, title='Section One', pagenum=1), + ToCEntry(level=1, title='Section Two', pagenum=1), + ToCEntry(level=2, title='Subsection Two.One', pagenum=2), + ToCEntry(level=1, + title='Section Three, with looong loooong looong title', + pagenum=3), + ToCEntry(level=2, + title='Subsection Three.One, ' + 'with even loooooooooooonger title, and probably even more', + pagenum=3), + ToCEntry(level=2, title='Subsection Three.Two', + pagenum=4), + ToCEntry(level=2, title='Subsection Three.Three', + pagenum=5), + ToCEntry(level=1, title='The End', pagenum=5) + ] + + with it("makes (write_toc -> read_toc) an identity operation (except vpos)"): + write_toc(self.doc, self.toc) + toc2 = read_toc(self.doc) + + assert len(toc2) == len(self.toc) + for e1, e2 in zip(self.toc, toc2): + assert e1.level == e2.level + assert e1.title == e2.title + assert e1.pagenum == e2.pagenum diff --git a/spec/xmeta_spec.py b/spec/xmeta_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..e1db81351c40a86ae8c5060dbf47177cd96d6f97 --- /dev/null +++ b/spec/xmeta_spec.py @@ -0,0 +1,188 @@ +import os +import fitz +import toml + +from mamba import description, it, before +from pdfxmeta import extract_meta, dump_meta, dump_toml + +dirpath = os.path.dirname(os.path.abspath(__file__)) + +with description("extract_meta:") as self: + with before.all: + self.doc = fitz.open(os.path.join(dirpath, "files/level2.pdf")) + + with it("extracts metadata from pdf"): + meta = extract_meta(self.doc, "Section One", 1) + assert len(meta) == 1 + + m = meta[0] + assert m['text'] == "Section One" + assert 'font' in m + assert 'CMBX12' in m['font'] + + with it("matches 
    with it("matches lowercase when ignore case is set"):
        meta = extract_meta(self.doc, "section one", 1, True)
        assert len(meta) == 1

        m = meta[0]
        assert m['text'] == "Section One"
        assert 'font' in m
        assert 'CMBX12' in m['font']

    with it("matches mixed case when ignore case is set"):
        meta = extract_meta(self.doc, "sEcTIoN OnE", 1, True)
        assert len(meta) == 1

        m = meta[0]
        assert m['text'] == "Section One"
        assert 'font' in m
        assert 'CMBX12' in m['font']

    with it("matches nothing if ignore case is not set"):
        meta = extract_meta(self.doc, "section one", 1, False)
        assert len(meta) == 0

    with it("can match multiple instances of needle"):
        meta = extract_meta(self.doc, "Section", 1)
        assert len(meta) == 2

        m = meta[0]
        assert m['text'] == "Section One"
        assert 'font' in m
        assert 'CMBX12' in m['font']

        m = meta[1]
        assert m['text'] == "Section Two"
        assert 'font' in m
        assert 'CMBX12' in m['font']

    with it("returns [] when nothing is matched"):
        # "Sectoin" is a deliberate misspelling.
        meta = extract_meta(self.doc, "Sectoin", 1, False)
        assert len(meta) == 0

    with it("returns [] when page number is out of range"):
        # Pages are 1-based here; 0 and 7 both fall outside the fixture.
        meta = extract_meta(self.doc, "Section One", 0)
        assert len(meta) == 0

        meta = extract_meta(self.doc, "Section One", 7)
        assert len(meta) == 0

    with it("can match text on any page when page number is not specified"):
        meta = extract_meta(self.doc, "The End")
        assert len(meta) == 1

        m = meta[0]
        assert m['text'] == "The End"
        assert 'font' in m
        assert 'CMBX12' in m['font']

with description("dump_meta:") as self:
    with before.all:
        self.doc = fitz.open(os.path.join(dirpath, "files/level2.pdf"))
        # Full span metadata expected for the "Section One" heading.
        self.expected_meta = {
            'font': {
                'name': 'CMBX12',
                'size': 14.346199989318848,
                'color': 0x000000,
                'superscript': False,
                'italic': False,
                'serif': True,
                'monospace': False,
                'bold': True
            },
            'bbox': {
                'left': 157.98439025878906,
                'top': 237.6484375,
                'right': 243.12905883789062,
                'bottom': 252.00897216796875
            }
        }

    with it("produces valid toml"):
        meta = extract_meta(self.doc, "Section One", 1)
        assert len(meta) == 1

        meta_dict = toml.loads(dump_meta(meta[0]))
        assert meta_dict == self.expected_meta


with description("dump_toml:") as self:
    with before.all:
        self.doc = fitz.open(os.path.join(dirpath, "files/level2.pdf"))
        # A heading-filter recipe keeps only the font name and size.
        self.expected_recipe = {
            'heading': [
                {
                    'level': 1,
                    'greedy': True,
                    'font': {
                        'name': 'CMBX12',
                        'size': 14.346199989318848,
                    }
                }
            ]
        }

    with it("produces valid toml"):
        meta = extract_meta(self.doc, "Section One", 1)
        assert len(meta) == 1

        meta_dict = toml.loads(dump_toml(meta[0], 1))
        assert meta_dict == self.expected_recipe

    with it("strips font subset correctly"):
        # Per the expectations below, only the first '+'-separated prefix
        # ("subset+") is stripped from the font name; later '+' stay.
        with_subset = {
            'font': "subset+font",
            'size': 1,
            'flags': 20,
            'color': 0,
            'bbox': (1, 2, 3, 4),
            'text': ""
        }

        without_subset = {
            'font': "font",
            'size': 1,
            'flags': 20,
            'color': 0,
            'bbox': (1, 2, 3, 4),
            'text': ""
        }

        expected = {
            'heading': [
                {
                    'level': 1,
                    'greedy': True,
                    'font': {
                        'name': 'font',
                        'size': 1
                    }
                }
            ]
        }

        double_plus = {
            'font': "subset+font+font",
            'size': 1,
            'flags': 20,
            'color': 0,
            'bbox': (1, 2, 3, 4),
            'text': ""
        }

        expected2 = {
            'heading': [
                {
                    'level': 1,
                    'greedy': True,
                    'font': {
                        'name': 'font+font',
                        'size': 1
                    }
                }
            ]
        }

        assert toml.loads(dump_toml(with_subset, 1)) == expected
        assert toml.loads(dump_toml(without_subset, 1)) == expected
        assert toml.loads(dump_toml(double_plus, 1)) == expected2
def main():
    """CLI: print every text span whose font name (and optionally size) matches.

    Usage: python find_by_font.py <pdf> <font_name> [font_size]
    """
    if len(sys.argv) < 3:
        # Fixed: the usage line had lost its argument placeholders.
        print("Usage: python find_by_font.py <pdf> <font_name> [font_size]")
        sys.exit(1)

    pdf_path = sys.argv[1]
    target_font = sys.argv[2]
    target_size = float(sys.argv[3]) if len(sys.argv) > 3 else None

    doc = fitz.open(pdf_path)

    print("Searching for:")
    print(f"  Font: '{target_font}'")
    # Fixed: compare against None explicitly, so an explicit size of 0
    # is not silently treated as "ANY".
    print(f"  Size: {target_size if target_size is not None else 'ANY'}")
    print("-" * 60)
    print(f"{'PAGE':<6} {'SIZE':<8} {'TEXT'}")
    print("-" * 60)

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:
            if "lines" not in b:
                continue  # non-text block (e.g. image)
            for l in b["lines"]:
                for s in l["spans"]:
                    # Partial, case-insensitive font-name match (friendlier
                    # than requiring the exact PostScript name).
                    if target_font.lower() in s["font"].lower():
                        # Size check with 1% relative tolerance, if requested.
                        if target_size is not None:
                            if not math.isclose(s["size"], target_size, rel_tol=1e-2):
                                continue
                        print(f"{page.number + 1:<6} {s['size']:<8.2f} '{s['text']}'")
def main():
    """CLI: show the exact characters (repr + hex) of spans containing a string.

    Useful for diagnosing invisible unicode (NBSP, soft hyphen) in ToC titles.
    """
    if len(sys.argv) < 3:
        # Fixed: the usage line had lost its argument placeholders.
        print("Usage: python inspect_bytes.py <pdf> \"<search_string>\"")
        sys.exit(1)

    pdf_path = sys.argv[1]
    search_str = sys.argv[2]

    doc = fitz.open(pdf_path)

    print(f"Searching for string containing: '{search_str}'")
    print("-" * 60)

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:
            if "lines" not in b:
                continue  # non-text block
            for l in b["lines"]:
                for s in l["spans"]:
                    text = s["text"]
                    if search_str in text:
                        # Visual form, escaped form, and per-char codepoints.
                        print(f"Page {page.number + 1}:")
                        print(f"  Visual: '{text}'")
                        print(f"  Raw: {ascii(text)}")
                        print(f"  Hex: { [hex(ord(c)) for c in text] }")
                        print("-" * 20)
def main():
    """CLI: scan a PDF and print the 25 largest text spans by font size."""
    if len(sys.argv) < 2:
        # Fixed: the usage line referred to the wrong script name
        # ("list_largest_fonts.py") and had lost its argument placeholder.
        print("Usage: python list_longest_fonts.py <pdf>")
        sys.exit(1)

    doc = fitz.open(sys.argv[1])

    # Collect candidate spans as (size, page, label, font, text) dicts and
    # sort at the end — simpler than maintaining a heap since the result
    # set shown is small.
    candidates = []

    print(f"Scanning {len(doc)} pages...")

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        label = page.get_label()
        for b in blocks:
            if "lines" in b:
                for l in b["lines"]:
                    for s in l["spans"]:
                        text = s["text"].strip()
                        if not text:
                            continue  # skip whitespace-only spans

                        candidates.append({
                            "size": s["size"],
                            "text": text[:50],  # truncate for display
                            "page": page.number + 1,
                            "label": label,
                            "font": s["font"]
                        })

    # Sort descending by size. No dedup on (size, font): seeing distinct
    # text instances of the same style is useful here.
    candidates.sort(key=lambda x: x["size"], reverse=True)

    # Fixed: the comment used to say "top 20" while the code shows 25.
    print("\n--- TOP 25 LARGEST TEXT SPANS ---")
    print(f"{'SIZE (pt)':<10} {'IDX':<6} {'LABEL':<8} {'FONT':<25} {'TEXT'}")
    print("-" * 75)

    for c in candidates[:25]:
        print(f"{c['size']:<10.2f} {c['page']:<6} {c['label']:<8} {c['font']:<25} '{c['text']}'")
+ + print(f"\n--- TOP 25 LARGEST TEXT SPANS ---") + print(f"{'SIZE (pt)':<10} {'IDX':<6} {'LABEL':<8} {'FONT':<25} {'TEXT'}") + print("-" * 75) + + for c in candidates[:25]: + print(f"{c['size']:<10.2f} {c['page']:<6} {c['label']:<8} {c['font']:<25} '{c['text']}'") + +if __name__ == "__main__": + main() diff --git a/utils/modify_toc.py b/utils/modify_toc.py new file mode 100644 index 0000000000000000000000000000000000000000..cbb3e221e65885c71db647eb702e5fd5baebbd13 --- /dev/null +++ b/utils/modify_toc.py @@ -0,0 +1,61 @@ +import sys +import re +import io + +def clean_text(text): + # Replace non-breaking spaces (\xa0) and soft hyphens (\xad) + # Also collapses multiple spaces + text = text.replace('\xa0', ' ').replace('\xad', '') + # Replace en-dash and em-dash with standard hyphen + text = text.replace('\u2013', '-').replace('\u2014', '-') + # Remove control characters (except allowed ones, though likely not needed for titles) + text = "".join(ch for ch in text if ch.isprintable()) + return ' '.join(text.split()) + +def main(): + # Force UTF-8 for stdin/stdout to handle special characters on Windows + # otherwise it defaults to cp1252/cp437 which mangles unicode + stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='replace') + stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace') + + # Regex to match ToC lines + # Captures: + # 1. Indentation (leading spaces) + # 2. Title (inside quotes) + # 3. Page Number + # 4. 
def main():
    """Read ToC lines on stdin, sanitize and renumber titles, write to stdout.

    Each line matching '  "Title" 123 ...' becomes '  "NNN_Title_pg123" 123 ...'
    (NNN is a running 3-digit index); non-matching lines pass through as-is.
    """
    # Force UTF-8 for stdin/stdout to handle special characters on Windows,
    # where the defaults (cp1252/cp437) would mangle unicode.
    stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors='replace')
    stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')

    # Regex to match ToC lines. Captures:
    #   1. indentation (leading spaces)
    #   2. title (inside quotes)
    #   3. page number
    #   4. trailing content (like vpos)
    pattern = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')

    idx = 0

    for line in stdin:
        # Strip newline for processing
        line_content = line.rstrip('\n')
        if not line_content:
            stdout.write("\n")
            continue

        match = pattern.match(line_content)
        if match:
            indent = match.group(1)
            old_title = match.group(2)
            page_num = match.group(3)
            trailing = match.group(4)

            # Sanitize the title (fix weird spaces/hyphens)
            cleaned_title = clean_text(old_title)

            # Format: 000_Title_pgX
            new_title = f"{idx:03d}_{cleaned_title}_pg{page_num}"

            # Reconstruct the line
            new_line = f'{indent}"{new_title}" {page_num}{trailing}'

            stdout.write(new_line + "\n")
            idx += 1
        else:
            # If line doesn't match expected format, print as is
            stdout.write(line_content + "\n")

    # Fixed: explicitly flush the wrapper. Without this, output buffered in
    # the hand-built TextIOWrapper can be lost if it is finalized without a
    # flush at interpreter exit.
    stdout.flush()
def main():
    """Split a PDF into one file per entry of a pdftocgen-style ToC file."""
    if len(sys.argv) < 3:
        # Fixed: the usage line had lost its argument placeholders.
        print("Usage: python split_by_toc.py <pdf> <toc_file> [output_dir]")
        sys.exit(1)

    # Force UTF-8 for stdout/stderr (Windows consoles default to legacy codepages).
    sys.stdout.reconfigure(encoding='utf-8')
    sys.stderr.reconfigure(encoding='utf-8')

    pdf_path = sys.argv[1]
    toc_path = sys.argv[2]
    output_dir = sys.argv[3] if len(sys.argv) > 3 else "split_output"

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print(f"Splitting '{pdf_path}' based on '{toc_path}'...")

    # 1. Parse the ToC into a list of (title, start_page).
    entries = []
    # Matches modify_toc-style output ('"Title" Page ...') as well as
    # standard pdftocgen output.
    pattern = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')

    with open(toc_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            match = pattern.match(line)
            if match:
                title = match.group(2)
                page = int(match.group(3))
                entries.append((title, page))

    if not entries:
        print("Error: No ToC entries found.")
        sys.exit(1)

    # 2. Open the source PDF.
    doc = fitz.open(pdf_path)
    total_pages = doc.page_count

    print(f"Total Pages: {total_pages}")
    print(f"Found {len(entries)} chapters.")
    print("-" * 40)

    # 3. Iterate and split: each chapter runs to the page before the next one.
    for i, (title, start_page) in enumerate(entries):
        # ToC pages are 1-based; PyMuPDF indices are 0-based.
        start_idx = start_page - 1

        if i < len(entries) - 1:
            next_start_page = entries[i + 1][1]
            end_idx = next_start_page - 1 - 1  # one page before next chapter
        else:
            end_idx = total_pages - 1

        # Safety check for out-of-order bookmarks or same-page chapters:
        # fall back to grabbing the single start page.
        if start_idx > end_idx:
            end_idx = start_idx

        filename = f"{title}.pdf"
        # Sanitize filename (remove characters forbidden on common filesystems).
        filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
        out_path = os.path.join(output_dir, filename)

        print(f"[{i+1}/{len(entries)}] {title}")
        print(f"  Pages {start_page} to {end_idx + 1} (Count: {end_idx - start_idx + 1})")

        # Create a new PDF for this chapter.
        new_doc = fitz.open()
        new_doc.insert_pdf(doc, from_page=start_idx, to_page=end_idx)
        new_doc.save(out_path)
        new_doc.close()

    print("-" * 40)
    print(f"Done! Files saved to '{output_dir}/'")
def main():
    """Split a PDF into one file per Level-1 bookmark (top-level chapter)."""
    if len(sys.argv) < 2:
        # Fixed: the usage line had lost its argument placeholder.
        print("Usage: python split_pdf.py <pdf> [output_dir]")
        sys.exit(1)

    # Force UTF-8 for stdout/stderr (Windows consoles default to legacy codepages).
    sys.stdout.reconfigure(encoding='utf-8')
    sys.stderr.reconfigure(encoding='utf-8')

    pdf_path = sys.argv[1]
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "split_output"

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print(f"Reading bookmarks from '{pdf_path}'...")

    doc = fitz.open(pdf_path)
    toc = doc.get_toc()  # [[lvl, title, page_num, ...], ...]

    if not toc:
        print("Error: No bookmarks found in this PDF.")
        sys.exit(1)

    # Only top-level (Level 1) bookmarks delimit chapters.
    chapters = [entry for entry in toc if entry[0] == 1]

    print(f"Found {len(chapters)} top-level chapters.")

    total_pages = doc.page_count

    for i, (lvl, title, start_page, *_) in enumerate(chapters):
        # Chapter N ends where chapter N+1 begins; the last chapter runs
        # to the end of the document. 1-based pages -> 0-based indices.
        start_idx = start_page - 1

        if i < len(chapters) - 1:
            next_start_page = chapters[i + 1][2]
            end_idx = next_start_page - 1 - 1
        else:
            end_idx = total_pages - 1

        # Sanity check for same-page or out-of-order chapters.
        if end_idx < start_idx:
            end_idx = start_idx

        filename = f"{title}.pdf"
        # Sanitize: strip characters forbidden in filenames.
        filename = re.sub(r'[<>:"/\\|?*]', '_', filename).strip()
        out_path = os.path.join(output_dir, filename)

        # Fixed: the progress line printed the literal "(unknown)" instead
        # of the chapter title.
        print(f"Extracting: {title} (Pages {start_page}-{end_idx+1})")

        new_doc = fitz.open()
        new_doc.insert_pdf(doc, from_page=start_idx, to_page=end_idx)
        new_doc.save(out_path)
        new_doc.close()

    print(f"Done. Files saved to {output_dir}/")
class TestTOCProcessor(unittest.TestCase):
    """Unit tests for the ToC cleanup helpers in toc_processor."""

    def test_merge_same_page_headers(self):
        # Scenario: "American Government..." (Page 31) followed by
        # "Divided World" (Page 31) — one multi-line header split in two.
        input_toc = [
            [1, "Chapter 1 Intro", 5],
            [1, "American Government and Politics in a Racially", 31],
            [1, "Divided World", 31],
            [1, "Chapter 2", 57]
        ]

        expected_toc = [
            [1, "Chapter 1 Intro", 5],
            [1, "American Government and Politics in a Racially Divided World", 31],
            [1, "Chapter 2", 57]
        ]

        result = merge_same_page_headers(input_toc)

        print(f"\nInput: {[e[1] for e in input_toc]}")
        print(f"Result: {[e[1] for e in result]}")

        # Fixed: expected_toc was built but never asserted against —
        # compare the full structure in addition to the spot checks.
        self.assertEqual(result, expected_toc)
        self.assertEqual(len(result), 3)
        self.assertEqual(result[1][1], "American Government and Politics in a Racially Divided World")
        self.assertEqual(result[1][2], 31)

    def test_merge_same_page_headers_mixed_levels(self):
        # Scenario: Level 1 followed by Level 2 on same page (Should NOT merge)
        input_toc = [
            [1, "Chapter 1", 10],
            [2, "Section 1.1", 10]
        ]

        result = merge_same_page_headers(input_toc)
        self.assertEqual(len(result), 2)

    def test_clean_text(self):
        # NBSP becomes a space; the soft hyphen disappears.
        dirty = "Hello\xa0World\xad"
        clean = clean_text(dirty)
        self.assertEqual(clean, "Hello World")
# A ToC entry is a plain list shaped like [level, title, page, ...].
FitZTOCEntry = list

def clean_text(text: str) -> str:
    """
    Sanitize text to remove common PDF artifacts.
    Removes soft hyphens, fixes non-breaking spaces, and standardizes dashes.
    """
    if not text:
        return ""

    # Single-pass character substitutions via a translation table:
    # NBSP -> space, soft hyphen -> removed, en/em dash -> '-'.
    table = str.maketrans({
        '\xa0': ' ',
        '\xad': None,
        '\u2013': '-',
        '\u2014': '-',
    })
    text = text.translate(table)

    # Drop remaining non-printable characters, then collapse whitespace.
    printable = "".join(ch for ch in text if ch.isprintable())
    return ' '.join(printable.split())

def parse_raw_toc_output(raw_output: str) -> List[FitZTOCEntry]:
    """
    Parses the raw text output from `pdftocgen` or `pdftocio` into a structured list.
    Expected format lines: '    "Chapter Title" 123'
    """
    # Captures: 1=indent, 2=title, 3=page number, 4=trailing content.
    line_re = re.compile(r'^(\s*)"(.*)"\s+(\d+)(.*)$')
    entries = []

    for raw_line in raw_output.splitlines():
        hit = line_re.match(raw_line)
        if hit is None:
            continue  # non-ToC line: skip
        leading, heading, page_digits, _tail = hit.groups()
        # Indentation encodes depth: 0 spaces = level 1, 4 spaces = level 2, ...
        entries.append([len(leading) // 4 + 1, heading, int(page_digits)])

    return entries
def merge_same_page_headers(toc: "List[FitZTOCEntry]") -> "List[FitZTOCEntry]":
    """
    Detects consecutive Level 1 headers derived from the same page and merges them.
    This fixes the "double split" issue where multi-line headers are detected
    as separate entries.

    Example:
        Input: [[1, "Title Part 1", 10], [1, "Title Part 2", 10]]
        Output: [[1, "Title Part 1 Title Part 2", 10]]
    """
    if not toc:
        return []

    merged_toc = []

    for entry in toc:
        level, title, page = entry[0], entry[1], entry[2]

        # We only care about merging Level 1 headers
        if level != 1:
            merged_toc.append(entry)
            continue

        # Check if we can merge with the previous entry
        if merged_toc:
            prev_entry = merged_toc[-1]
            prev_level, prev_title, prev_page = prev_entry[0], prev_entry[1], prev_entry[2]

            # CRITERIA: Both Level 1, Same Page
            if prev_level == 1 and prev_page == page:
                # Merge! Update the previous entry's title
                merged_toc[-1][1] = f"{prev_title} {title}"
                continue

        # Fixed: append a COPY of Level-1 entries. The original appended the
        # caller's list objects and then rewrote their titles in place on a
        # later merge, mutating the input ToC as a side effect.
        merged_toc.append(list(entry))

    return merged_toc

def process_toc(raw_toc_content: str) -> str:
    """
    Full pipeline to clean and format raw TOC content.
    Returns the string content formatted for `pdftocio` input.
    """
    # 1. Parse
    parsed_toc = parse_raw_toc_output(raw_toc_content)

    # 2. Clean Titles
    for entry in parsed_toc:
        entry[1] = clean_text(entry[1])

    # 3. Merge Same-Page Headers (The Double Split Fix)
    merged_toc = merge_same_page_headers(parsed_toc)

    # 4. Format for Output (re-serialize)
    # pdftocio expects: "Title" PageNum
    # DECOUPLED: We keep the PDF bookmarks clean (no number prefix).
    # File naming handling is moved to generate_chapter_splits.
    output_lines = []

    for entry in merged_toc:
        level, title, page = entry[0], entry[1], entry[2]

        # Indent: 4 spaces per level minus 1
        indent = " " * (4 * (level - 1))
        output_lines.append(f'{indent}"{title}" {page}')

    return "\n".join(output_lines)
def generate_chapter_splits(input_pdf_path: str, output_zip_path: str, back_matter_start_page: Optional[int] = None):
    """
    Splits the PDF based on Level 1 TOC entries and writes a ZIP file to the output path.

    Output naming inside the ZIP: 000_Front_matter.pdf (if any),
    001_<Title>_pg<N>.pdf ... for chapters, 999_Back_matter.pdf (if requested).

    Args:
        input_pdf_path: Path to source PDF
        output_zip_path: Path to write the ZIP
        back_matter_start_page: 1-based page number where Back Matter starts.
                                Chapters will be clamped to end before this page.
                                Content from this page to end will be saved as 999_Back_Matter.pdf.

    Raises:
        ValueError: if the PDF carries no table of contents.
    """
    doc = fitz.open(input_pdf_path)
    toc = doc.get_toc()  # entries shaped [level, title, page, ...]

    if not toc:
        doc.close()
        raise ValueError("No Table of Contents found in the PDF.")

    # Create the zip file
    with zipfile.ZipFile(output_zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        total_pages = doc.page_count

        # --- Front Matter Extraction ---
        # Find the first Level 1 chapter
        first_l1_page = None
        for entry in toc:
            if entry[0] == 1:
                first_l1_page = entry[2]
                break

        # If the first chapter starts after Page 1, extract Front Matter
        if first_l1_page and first_l1_page > 1:
            # Front matter is from page 0 to (first_l1_page - 1) - 1 (index)
            fm_end_idx = first_l1_page - 2

            if fm_end_idx >= 0:
                fm_doc = fitz.open()
                fm_doc.insert_pdf(doc, from_page=0, to_page=fm_end_idx)
                zf.writestr("000_Front_matter.pdf", fm_doc.tobytes())
                fm_doc.close()

        # --- Chapter Extraction ---
        chapter_idx = 1

        for i, entry in enumerate(toc):
            level, title, start_page = entry[0], entry[1], entry[2]

            # We skip non-L1 for splitting functionality
            if level != 1:
                continue

            # If this chapter starts AT or AFTER the back matter, skip it (it's inside back matter)
            if back_matter_start_page and start_page >= back_matter_start_page:
                continue

            start_idx = start_page - 1

            # Determine end page by lookahead: the next Level-1 entry bounds
            # this chapter; the last chapter runs to the end of the document.
            end_page = total_pages
            for next_entry in toc[i+1:]:
                if next_entry[0] == 1:
                    # The start of the next chapter is the end of this one
                    end_page = next_entry[2] - 1
                    break

            # --- CLAMPING: Check against Back Matter ---
            if back_matter_start_page:
                # If the *natural* end of this chapter goes into back matter, cut it short.
                # The cut point is back_matter_start_page - 1.
                # Example: Back Matter starts Pg 100. Chapter ends naturally Pg 105. Clamp to Pg 99.
                if end_page >= back_matter_start_page:
                    end_page = back_matter_start_page - 1

            end_idx = end_page - 1

            # Safety clamp for same-page or out-of-order chapters
            if end_idx < start_idx:
                end_idx = start_idx

            # Create sub-document
            new_doc = fitz.open()
            new_doc.insert_pdf(doc, from_page=start_idx, to_page=end_idx)

            # Sanitize filename: keep alphanumerics, spaces, '-' and '_'
            safe_title = "".join([c for c in title if c.isalnum() or c in (' ', '-', '_')]).strip()
            if not safe_title:
                safe_title = f"chapter_{chapter_idx}"

            # Formatting: 001_Title_pgX.pdf
            pdf_name = f"{chapter_idx:03d}_{safe_title}_pg{start_page}.pdf"
            chapter_idx += 1

            # Write to zip
            zf.writestr(pdf_name, new_doc.tobytes())
            new_doc.close()

        # --- Back Matter Generation ---
        if back_matter_start_page and back_matter_start_page <= total_pages:
            bm_start_idx = back_matter_start_page - 1
            bm_end_idx = total_pages - 1

            bm_doc = fitz.open()
            bm_doc.insert_pdf(doc, from_page=bm_start_idx, to_page=bm_end_idx)
            zf.writestr("999_Back_matter.pdf", bm_doc.tobytes())
            bm_doc.close()

    doc.close()