diff --git a/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/AUTHORS.md b/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/AUTHORS.md
new file mode 100644
index 0000000000000000000000000000000000000000..baa8504e2036be53e3fcaaaa4f2ea76d12773322
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/AUTHORS.md
@@ -0,0 +1,313 @@
+# Natural Language Toolkit (NLTK) Authors
+
+## Original Authors
+
+- Steven Bird
+- Edward Loper
+- Ewan Klein
+
+## Contributors
+
+- Tom Aarsen
+- Rami Al-Rfou'
+- Mark Amery
+- Greg Aumann
+- Ivan Barria
+- Ingolf Becker
+- Yonatan Becker
+- Paul Bedaride
+- Steven Bethard
+- Robert Berwick
+- Dan Blanchard
+- Nathan Bodenstab
+- Alexander Böhm
+- Francis Bond
+- Paul Bone
+- Jordan Boyd-Graber
+- Daniel Blanchard
+- Phil Blunsom
+- Lars Buitinck
+- Cristian Capdevila
+- Steve Cassidy
+- Chen-Fu Chiang
+- Dmitry Chichkov
+- Jinyoung Choi
+- Andrew Clausen
+- Lucas Champollion
+- Graham Christensen
+- Trevor Cohn
+- David Coles
+- Tom Conroy
+- Claude Coulombe
+- Lucas Cooper
+- Robin Cooper
+- Chris Crowner
+- James Curran
+- Arthur Darcet
+- Dariel Dato-on
+- Selina Dennis
+- Leon Derczynski
+- Alexis Dimitriadis
+- Nikhil Dinesh
+- Liang Dong
+- David Doukhan
+- Rebecca Dridan
+- Pablo Duboue
+- Long Duong
+- Christian Federmann
+- Campion Fellin
+- Michelle Fullwood
+- Dan Garrette
+- Maciej Gawinecki
+- Jean Mark Gawron
+- Sumukh Ghodke
+- Yoav Goldberg
+- Michael Wayne Goodman
+- Dougal Graham
+- Brent Gray
+- Simon Greenhill
+- Clark Grubb
+- Eduardo Pereira Habkost
+- Masato Hagiwara
+- Lauri Hallila
+- Michael Hansen
+- Yurie Hara
+- Will Hardy
+- Tyler Hartley
+- Peter Hawkins
+- Saimadhav Heblikar
+- Fredrik Hedman
+- Helder
+- Michael Heilman
+- Ofer Helman
+- Christopher Hench
+- Bruce Hill
+- Amy Holland
+- Kristy Hollingshead
+- Marcus Huderle
+- Baden Hughes
+- Nancy Ide
+- Rebecca Ingram
+- Edward Ivanovic
+- Thomas Jakobsen
+- Nick Johnson
+- Eric Kafe
+- Piotr Kasprzyk
+- Angelos Katharopoulos
+- Sudharshan Kaushik
+- Chris Koenig
+- Mikhail Korobov
+- Denis Krusko
+- Ilia Kurenkov
+- Stefano Lattarini
+- Pierre-François Laquerre
+- Stefano Lattarini
+- Haejoong Lee
+- Jackson Lee
+- Max Leonov
+- Chris Liechti
+- Hyuckin David Lim
+- Tom Lippincott
+- Peter Ljunglöf
+- Alex Louden
+- Joseph Lynch
+- Nitin Madnani
+- Felipe Madrigal
+- Bjørn Mæland
+- Dean Malmgren
+- Christopher Maloof
+- Rob Malouf
+- Iker Manterola
+- Carl de Marcken
+- Mitch Marcus
+- Torsten Marek
+- Robert Marshall
+- Marius Mather
+- Duncan McGreggor
+- David McClosky
+- Xinfan Meng
+- Dmitrijs Milajevs
+- Margaret Mitchell
+- Tomonori Nagano
+- Jason Narad
+- Shari A’aidil Nasruddin
+- Lance Nathan
+- Morten Neergaard
+- David Nemeskey
+- Eric Nichols
+- Joel Nothman
+- Alireza Nourian
+- Alexander Oleynikov
+- Pierpaolo Pantone
+- Ted Pedersen
+- Jacob Perkins
+- Alberto Planas
+- Ondrej Platek
+- Alessandro Presta
+- Qi Liu
+- Martin Thorsen Ranang
+- Michael Recachinas
+- Brandon Rhodes
+- Joshua Ritterman
+- Will Roberts
+- Stuart Robinson
+- Carlos Rodriguez
+- Lorenzo Rubio
+- Alex Rudnick
+- Jussi Salmela
+- Geoffrey Sampson
+- Kepa Sarasola
+- Kevin Scannell
+- Nathan Schneider
+- Rico Sennrich
+- Thomas Skardal
+- Eric Smith
+- Lynn Soe
+- Rob Speer
+- Peter Spiller
+- Richard Sproat
+- Ceri Stagg
+- Peter Stahl
+- Oliver Steele
+- Thomas Stieglmaier
+- Jan Strunk
+- Liling Tan
+- Claire Taylor
+- Louis Tiao
+- Steven Tomcavage
+- Tiago Tresoldi
+- Marcus Uneson
+- Yu Usami
+- Petro Verkhogliad
+- Peter Wang
+- Zhe Wang
+- Charlotte Wilson
+- Chuck Wooters
+- Steven Xu
+- Beracah Yankama
+- Lei Ye (叶磊)
+- Patrick Ye
+- Geraldine Sim Wei Ying
+- Jason Yoder
+- Thomas Zieglier
+- 0ssifrage
+- ducki13
+- kiwipi
+- lade
+- isnowfy
+- onesandzeros
+- pquentin
+- wvanlint
+- Álvaro Justen
+- bjut-hz
+- Sergio Oller
+- Will Monroe
+- Elijah Rippeth
+- Emil Manukyan
+- Casper Lehmann-Strøm
+- Andrew Giel
+- Tanin Na Nakorn
+- Linghao Zhang
+- Colin Carroll
+- Heguang Miao
+- Hannah Aizenman (story645)
+- George Berry
+- Adam Nelson
+- J Richard Snape
+- Alex Constantin
+- Tsolak Ghukasyan
+- Prasasto Adi
+- Safwan Kamarrudin
+- Arthur Tilley
+- Vilhjalmur Thorsteinsson
+- Jaehoon Hwang
+- Chintan Shah
+- sbagan
+- Zicheng Xu
+- Albert Au Yeung
+- Shenjian Zhao
+- Deng Wang
+- Ali Abdullah
+- Stoytcho Stoytchev
+- Lakhdar Benzahia
+- Kheireddine Abainia
+- Yibin Lin
+- Artiem Krinitsyn
+- Björn Mattsson
+- Oleg Chislov
+- Pavan Gururaj Joshi
+- Ethan Hill
+- Vivek Lakshmanan
+- Somnath Rakshit
+- Anlan Du
+- Pulkit Maloo
+- Brandon M. Burroughs
+- John Stewart
+- Iaroslav Tymchenko
+- Aleš Tamchyna
+- Tim Gianitsos
+- Philippe Partarrieu
+- Andrew Owen Martin
+- Adrian Ellis
+- Nat Quayle Nelson
+- Yanpeng Zhao
+- Matan Rak
+- Nick Ulle
+- Uday Krishna
+- Osman Zubair
+- Viresh Gupta
+- Ondřej Cífka
+- Iris X. Zhou
+- Devashish Lal
+- Gerhard Kremer
+- Nicolas Darr
+- Hervé Nicol
+- Alexandre H. T. Dias
+- Daksh Shah
+- Jacob Weightman
+- Bonifacio de Oliveira
+- Armins Bagrats Stepanjans
+- Vassilis Palassopoulos
+- Ram Rachum
+- Or Sharir
+- Denali Molitor
+- Jacob Moorman
+- Cory Nezin
+- Matt Chaput
+- Danny Sepler
+- Akshita Bhagia
+- Pratap Yadav
+- Hiroki Teranishi
+- Ruben Cartuyvels
+- Dalton Pearson
+- Robby Horvath
+- Gavish Poddar
+- Saibo Geng
+- Ahmet Yildirim
+- Yuta Nakamura
+- Adam Hawley
+- Panagiotis Simakis
+- Richard Wang
+- Alexandre Perez-Lebel
+- Fernando Carranza
+- Martin Kondratzky
+- Heungson Lee
+- M.K. Pawelkiewicz
+- Steven Thomas Smith
+- Jan Lennartz
+
+## Others whose work we've taken and included in NLTK, but who didn't directly contribute it:
+
+### Contributors to the Porter Stemmer
+
+- Martin Porter
+- Vivake Gupta
+- Barry Wilkins
+- Hiranmay Ghosh
+- Chris Emerson
+
+### Authors of snowball arabic stemmer algorithm
+
+- Assem Chelli
+- Abdelkrim Aries
+- Lakhdar Benzahia
diff --git a/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/LICENSE.txt b/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/LICENSE.txt
new file mode 100644
index 0000000000000000000000000000000000000000..75b52484ea471f882c29e02693b4f02dba175b5e
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/LICENSE.txt
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/PKG-INFO b/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/PKG-INFO
new file mode 100644
index 0000000000000000000000000000000000000000..7b4dc6c97856f67b0b39b2728dd1826b6e32d0ed
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/PKG-INFO
@@ -0,0 +1,66 @@
+Metadata-Version: 2.1
+Name: nltk
+Version: 3.8
+Summary: Natural Language Toolkit
+Home-page: https://www.nltk.org/
+Author: NLTK Team
+Author-email: nltk.team@gmail.com
+Maintainer: NLTK Team
+Maintainer-email: nltk.team@gmail.com
+License: Apache License, Version 2.0
+Project-URL: Documentation, https://www.nltk.org/
+Project-URL: Source Code, https://github.com/nltk/nltk
+Project-URL: Issue Tracker, https://github.com/nltk/nltk/issues
+Keywords: NLP,CL,natural language processing,computational linguistics,parsing,tagging,tokenizing,syntax,linguistics,language,natural language,text analytics
+Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Education
+Classifier: Intended Audience :: Information Technology
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Topic :: Scientific/Engineering
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Scientific/Engineering :: Human Machine Interfaces
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: Text Processing
+Classifier: Topic :: Text Processing :: Filters
+Classifier: Topic :: Text Processing :: General
+Classifier: Topic :: Text Processing :: Indexing
+Classifier: Topic :: Text Processing :: Linguistic
+Requires-Python: >=3.7
+Requires-Dist: click
+Requires-Dist: joblib
+Requires-Dist: regex (>=2021.8.3)
+Requires-Dist: tqdm
+Provides-Extra: all
+Requires-Dist: numpy ; extra == 'all'
+Requires-Dist: python-crfsuite ; extra == 'all'
+Requires-Dist: scikit-learn ; extra == 'all'
+Requires-Dist: pyparsing ; extra == 'all'
+Requires-Dist: scipy ; extra == 'all'
+Requires-Dist: twython ; extra == 'all'
+Requires-Dist: matplotlib ; extra == 'all'
+Requires-Dist: requests ; extra == 'all'
+Provides-Extra: corenlp
+Requires-Dist: requests ; extra == 'corenlp'
+Provides-Extra: machine_learning
+Requires-Dist: numpy ; extra == 'machine_learning'
+Requires-Dist: python-crfsuite ; extra == 'machine_learning'
+Requires-Dist: scikit-learn ; extra == 'machine_learning'
+Requires-Dist: scipy ; extra == 'machine_learning'
+Provides-Extra: plot
+Requires-Dist: matplotlib ; extra == 'plot'
+Provides-Extra: tgrep
+Requires-Dist: pyparsing ; extra == 'tgrep'
+Provides-Extra: twitter
+Requires-Dist: twython ; extra == 'twitter'
+
+The Natural Language Toolkit (NLTK) is a Python package for
+natural language processing. NLTK requires Python 3.7, 3.8, 3.9 or 3.10.
+
diff --git a/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/README.md b/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..27a18bc7372d1aaa9cbb7b95d882df199fc43c09
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/README.md
@@ -0,0 +1,50 @@
+# Natural Language Toolkit (NLTK)
+[![PyPI](https://img.shields.io/pypi/v/nltk.svg)](https://pypi.python.org/pypi/nltk)
+![CI](https://github.com/nltk/nltk/actions/workflows/ci.yaml/badge.svg?branch=develop)
+
+NLTK -- the Natural Language Toolkit -- is a suite of open source Python
+modules, data sets, and tutorials supporting research and development in Natural
+Language Processing. NLTK requires Python version 3.7, 3.8, 3.9 or 3.10.
+
+For documentation, please visit [nltk.org](https://www.nltk.org/).
+
+
+## Contributing
+
+Do you want to contribute to NLTK development? Great!
+Please read [CONTRIBUTING.md](CONTRIBUTING.md) for more details.
+
+See also [how to contribute to NLTK](https://www.nltk.org/contribute.html).
+
+
+## Donate
+
+Have you found the toolkit helpful? Please support NLTK development by donating
+to the project via PayPal, using the link on the NLTK homepage.
+
+
+## Citing
+
+If you publish work that uses NLTK, please cite the NLTK book, as follows:
+
+    Bird, Steven, Edward Loper and Ewan Klein (2009).
+    Natural Language Processing with Python. O'Reilly Media Inc.
+
+
+## Copyright
+
+Copyright (C) 2001-2022 NLTK Project
+
+For license information, see [LICENSE.txt](LICENSE.txt).
+
+[AUTHORS.md](AUTHORS.md) contains a list of everyone who has contributed to NLTK.
+
+
+### Redistributing
+
+- NLTK source code is distributed under the Apache 2.0 License.
+- NLTK documentation is distributed under the Creative Commons
+  Attribution-Noncommercial-No Derivative Works 3.0 United States license.
+- NLTK corpora are provided under the terms given in the README file for each
+  corpus; all are redistributable and available for non-commercial use.
+- NLTK may be freely redistributed, subject to the provisions of these licenses.
diff --git a/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/RECORD b/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/RECORD new file mode 100644 index 0000000000000000000000000000000000000000..ebdeaeaa7e4e3070493d9ec1ae1ef45615036382 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/RECORD @@ -0,0 +1,424 @@ +nltk/VERSION,sha256=jafAevDIiL7Z5ft9u5aRiMshWGFUdTdY0ucHs9t_Gvw,5 +nltk/__init__.py,sha256=qxCSs7zbRGWYWeO3d8B-A-IGhtXsqmsgOkqSL2Kx0bA,6379 +nltk/book.py,sha256=JWsrbLQakH3rVBQxz5hqYRM6PlTcpFPwlzJT5Gy8IMQ,3912 +nltk/cli.py,sha256=EHbvaA6evjSsc2wwwC4IkcaT4Whx3OHpUCsC5SuHpQM,1897 +nltk/collections.py,sha256=dnBFmM9kU3Puz91S0hYfYgdXHB-vFK3In33SRPacgHw,23673 +nltk/collocations.py,sha256=VbWVVGELA9DUZA2FRyk4wnPeFAvVnB7G1WpXLd93Tec,14964 +nltk/compat.py,sha256=gh6zsmdisyUDlZdcT0maFa0d_zr1EztZQaWHNB-uQZk,1307 +nltk/data.py,sha256=Nbv7i5_OkcafQxjlRV1oWgsMpISJBwJJQJjzQb_vK0U,52776 +nltk/decorators.py,sha256=U1-DvExxy0Uv96M0St_rR8IAh8Em3eK6uS4AXIf_Ti4,8526 +nltk/downloader.py,sha256=yWnqiNK_yyTWyoBX70KbaWXSosrs33eY_MikzSDBUQU,95506 +nltk/featstruct.py,sha256=AcI2P9dsbiBV4HFf4EV7LYHvP8CXSapEYyCbFBuEpD8,106108 +nltk/grammar.py,sha256=SiituYNmuuMy-NjoC3jJJ5a3Huk-7nU_qbjYCpHmjT8,59174 +nltk/help.py,sha256=mfL9PN1hX5KsYnO3GI0wWtMTSaIDqnLWpRXo-ms-iIM,1709 +nltk/internals.py,sha256=U7caoj9_IqI_NX_xb_aHiL7Py5h9PavO9lxkxpA9J5k,39416 +nltk/jsontags.py,sha256=jbeCWNSmZgupf6uHa-5PvDe__jMNlWuRK-tE0alJz3E,1948 +nltk/langnames.py,sha256=726IDLMA0u9__VRysf0pvOC5Q5_H12a9q4yVEpF1h2Q,17952 +nltk/lazyimport.py,sha256=qBI5PNKz5qYLrxow19KwLrSE821TyRhVKCafAGML-1E,4719 +nltk/probability.py,sha256=Wt7WI8xXCystjqVlElxdWPU6ViuAERRDyClRbSImUzc,92907 +nltk/text.py,sha256=OCdVvkTHbNTJUhSGNRGUZCrQ5K4LG_9knG6pOwV_ArQ,28909 +nltk/tgrep.py,sha256=9AgmyBKWq9wvRrLwrfClrCdy7zjUVapZZOYwoBFHfMI,37911 +nltk/toolbox.py,sha256=8DEZiwyImBa4ZM7fQ78SyxcXuF84daKz8BbXfGA5tl8,18337 +nltk/treeprettyprinter.py,sha256=nQ22X8TTxFtiv8Uu0lIO5L8WauHjbQ3drIc2HzEeLpU,975 +nltk/treetransforms.py,sha256=B_0-bNh4gkTu2dY8E5mt8XQdN97IKFL7LrtOMeASRps,5288 +nltk/util.py,sha256=Cb6SG42Ead5UNM_lDXygm2hWX6sV9gdV05tOIQlllVU,42026 +nltk/wsd.py,sha256=VDyLRf-wtfEXmyLNfp88Tc8T8Fhugge3OgSDWhAiq_g,1789 +nltk/app/__init__.py,sha256=2OPD4j-bQl-EhCE09ziWHkWjS16ce4efXx9KxVArNgU,1578 +nltk/app/chartparser_app.py,sha256=pmnPQCI_AR-lBYoxFF059bjRQfWUn7mjwQG96eZm1BM,88195 +nltk/app/chunkparser_app.py,sha256=HAJjscog0G9CD4h1d3McyB9v9y5jWIjQEJWesdFPInI,58322 +nltk/app/collocations_app.py,sha256=oNUkqelbwgSRlFxzL_OKGyq2JCIOAMdrfD1SDMWVQ74,14664 +nltk/app/concordance_app.py,sha256=Ctx1xgl_3h6BP84JkOj5qVamJyNFlBcwSBEWEdNHi5M,24882 +nltk/app/nemo_app.py,sha256=6ZBJXlJWKWoYnsrEy3Yy6IeFxcy3FNaGWK6QnMYEy4E,12305 +nltk/app/rdparser_app.py,sha256=UDba1O-0J0E5ipb38N3ZvIWvSoajxxlT9WrnvftgsEs,37781 +nltk/app/srparser_app.py,sha256=hqKWhLgJ616GrI-vep53mXvnUgauv4KTmnX6Tf8GcJw,34401 +nltk/app/wordfreq_app.py,sha256=1K7HaEbiVd5AZrpko2lQ1c_nCbSPhd4de2xdBLOhWGg,957 +nltk/app/wordnet_app.py,sha256=K5aCABeArI-ZxAZu6kV7mD4IaVZTN_BSEhCu6EfFico,35213 +nltk/ccg/__init__.py,sha256=8Ymq8xc3O0-6QJ97LQ35xCf_ne1mI920tkZoItjIJC8,915 +nltk/ccg/api.py,sha256=vURbaiW4Yw09TV8IrmNVM_5zN8zI4zjHOJE11GEDcAU,10360 +nltk/ccg/chart.py,sha256=ml3AHNENRnYlts2KPMS5XhZUjYPeRFZUsIuxOdakVNY,14147 +nltk/ccg/combinator.py,sha256=_UxsMUU3EctSv7BRd_AQDFpiThirsCMj4CwZkAuujoM,10633 +nltk/ccg/lexicon.py,sha256=KU-8BN7v3idl21hGJp17-RSCPvBi-_IZtnh1iYi4TlU,9863 +nltk/ccg/logic.py,sha256=MtIlnuQHLX87ZsZKuzWi7zHkuRZ6yhGIkqL_-VvnQt4,1871 +nltk/chat/__init__.py,sha256=j0MXWx6yx4kNWRrQvzEtPsUUcU_hF9bLehp9Rgh_v00,1556 
+nltk/chat/eliza.py,sha256=59saygPlQ0-WC5q4qyvB0M5gXbGE6-sYd8Sb2B0cC1Q,9626 +nltk/chat/iesha.py,sha256=ujBOuWGk8bElbifa-AfLzPG_znK5A8uJKx5pG1MEL-4,4407 +nltk/chat/rude.py,sha256=dP9A5CfuXiahWdV-UEQ1_VPbyNwUV9igE6ytf-8rQJo,3289 +nltk/chat/suntsu.py,sha256=JVsEUdIT_NsV-Pvf_8cb4Ni7xTUizUU6_dhwZmglc2Y,7185 +nltk/chat/util.py,sha256=BDR9XGIZ3eaQZI7lmRV8rAlIFSuITymuGboaJ_0hHnI,4014 +nltk/chat/zen.py,sha256=ZvwCW-IwGXkB9Ow4E2X0_Me-AXyHzRysaPzvZTH-Q-U,11679 +nltk/chunk/__init__.py,sha256=iss_ZDWLksRyCYqYx7h1Lp-myvXGdum7ThheJEzngs0,7597 +nltk/chunk/api.py,sha256=tyChrHxfXT0igTlGFuII6OrIeVRorWkB2bfUBbEWCqo,1946 +nltk/chunk/named_entity.py,sha256=dyj2aUNqbb5jB9s-U5DIONzM9y3b4PpSaF0h6eqpcrU,11140 +nltk/chunk/regexp.py,sha256=kpZ4ncsCF_oeCRitIa0fQot4-e6DwhUx9k6xDzgoRQE,55980 +nltk/chunk/util.py,sha256=reEcP1_vA1xTeK9TvwuGUOlHWkly6M5hOITy9Vlff9M,21311 +nltk/classify/__init__.py,sha256=wzMuqlkUNHx599q5HiQlrw-mU2etQoAnDe7GByrhY7k,4596 +nltk/classify/api.py,sha256=qVj6-ToOO_moPF-nlJA-D1Sw3wnVimHa5-KFcHAHRAY,6625 +nltk/classify/decisiontree.py,sha256=QuVmhYD_91vAuQEQq2sqJ_u751YlBRfQsoT_LXDoen4,13083 +nltk/classify/maxent.py,sha256=rDWlzUayXDfmQCsiHI88W6Gk-pllpFQt9BUAkvRtpzM,60921 +nltk/classify/megam.py,sha256=f40MRdN2M0cB3_YtWcDItokUwf23-D08oE3niXm49Uo,6396 +nltk/classify/naivebayes.py,sha256=j--w3JzOTDvMDTdkkAKI6smSv85VTPkETmPTX-wcaZ0,10713 +nltk/classify/positivenaivebayes.py,sha256=WckMp6Olu6x6Ku__NCRVPO2n6WY_AI2yr1y46cy-IgU,7412 +nltk/classify/rte_classify.py,sha256=fR3p6_O84sP1cfIaJpTcRzYbQqEEARRTHvFTY12KwMY,6301 +nltk/classify/scikitlearn.py,sha256=_D3TQC-jxEn-eq3Y7Ydc1OczkhIAbednxDpcFpbj99U,5548 +nltk/classify/senna.py,sha256=nonby5yIMQ0ZRGKecnuMng7WNBjyOiyWN3Uhf06HlBk,6931 +nltk/classify/svm.py,sha256=JuUiBrudmC9G0ECpxRoH4fPWOqbo0fjk8QvC3isYWxs,525 +nltk/classify/tadm.py,sha256=uo5Q-wjqLQ5ig-2GoHb718V77AwdVQjPo20xe8bfpgI,3555 +nltk/classify/textcat.py,sha256=M1KXl9Qug1DKOeWB1wTXH_aPLOOlDpOMtY6UyR3KLOk,6035 +nltk/classify/util.py,sha256=qppSD7cGWrERbQlWKCJP1UqU26T1br5C_yv-n2wH0OE,12461 +nltk/classify/weka.py,sha256=MOu7asY_WGfqaka2O7O44Fc11Z0v1Nc13wyu4Mbg334,12938 +nltk/cluster/__init__.py,sha256=TeNhSjGdfbYPbr3TPwuBVB1whBT4qr5cM_VmIAf8zQ0,4361 +nltk/cluster/api.py,sha256=ibEhMyocllfIgq4YtM3r1tM0tzTQTQVdwIK08E2m7PY,2162 +nltk/cluster/em.py,sha256=wJk7pf0rQgvvycTKyUlipa_gtLy796abLB7C4fL13X8,8419 +nltk/cluster/gaac.py,sha256=Jb2f8hCxniGsEv75n8aZYyZQ4psu65GQXFV4JV_ZtqE,5921 +nltk/cluster/kmeans.py,sha256=3-HbAKUrOJhMDrrgfhVO5NBnVGac0uPBHEWgqGJEC08,8592 +nltk/cluster/util.py,sha256=K3E_c_cJXlkKtDUC_L25xRTFFJivTiS-3iy547BAnII,10039 +nltk/corpus/__init__.py,sha256=TDc-AfvXOrIxO-QPzsD_MypUllJ39bcv-evDjcpdwgQ,17359 +nltk/corpus/europarl_raw.py,sha256=bIXJnXmH5vdnD-k8ITGorcXUzQFojksg1JkWBgVBoR8,1896 +nltk/corpus/util.py,sha256=yhCSvPX8NsjV0UScWKI80ogkfiZrtnJw2FxIxBExAl8,5867 +nltk/corpus/reader/__init__.py,sha256=410BhPVmlOCo7NDn2A_RjpjcwAdhkPn0Y8XXo4iUYB4,6677 +nltk/corpus/reader/aligned.py,sha256=zKuYhx11Jk12bF5YN1zFrwbWcJbkzM03EN45mUQQ1Tw,5005 +nltk/corpus/reader/api.py,sha256=YbWlD2qHuPnt2SjJLYLP9QQwE1JtL50bIlS56-Ahyv8,19671 +nltk/corpus/reader/bcp47.py,sha256=UWTLQaKHfRUA21ddT27AE9RYipV2Cn5lYDidFT6RNdk,8529 +nltk/corpus/reader/bnc.py,sha256=_mZmdFk1Syp9S1-srE_AW1jv-O6LoTMdK-qSanZ3twY,9716 +nltk/corpus/reader/bracket_parse.py,sha256=FGUi-VPjcnA8Rh7WRX59DgAlDaqebNxzL2Uk3uYCqhE,9619 +nltk/corpus/reader/categorized_sents.py,sha256=jbHGmNnMWOX5oSxKSbvACG9r6ptfYLEZrdIhjCJPY_U,6221 +nltk/corpus/reader/chasen.py,sha256=Wb_UOMO_MRbKRECOpivtr0vaN6g3HWTjrERc85FcZdk,4699 
+nltk/corpus/reader/childes.py,sha256=ptJGkkOyvxeNKgn3wat8u36C3TY7b9ctEsvmbbTr9SE,26105 +nltk/corpus/reader/chunked.py,sha256=tqUobQJLw7SWXAw9ZlrsCUUec9YZw0gZTgmwkdRBpcM,9366 +nltk/corpus/reader/cmudict.py,sha256=0s3RhLMhCtzPXUbevmbwzU5F0PETihcc2adr_HjVky0,3366 +nltk/corpus/reader/comparative_sents.py,sha256=vuvy1UROWU8w5Y3Kqk8N-Xm7rzkp24wzoW1AY5F7TZM,12069 +nltk/corpus/reader/conll.py,sha256=xXH1asejKfKm1bc7lj1WTzU9qXoHe5Bhbx-ePdbP_ZM,22301 +nltk/corpus/reader/crubadan.py,sha256=iBVR1gNW-8XyGOSYXOvg-F3cVGfCf0FT30QAYE-Z-WY,3627 +nltk/corpus/reader/dependency.py,sha256=m_UnaeHyAvLK1Fo5vxYo0ZLaNkzMM-_zQIZ0LNsKYi0,3890 +nltk/corpus/reader/framenet.py,sha256=EO70-dpHfV4b3YWQZ7Pn65N7c1strjncOgGzsYXHFp4,134791 +nltk/corpus/reader/ieer.py,sha256=ky0Q2P0F2GLXVUmVeupfpRaqtJf26kNd4w4bt33fVQ0,3802 +nltk/corpus/reader/indian.py,sha256=QD-D2panbZCZ1CWAp0nkCrzEanTZN5cdC78QVoP7eWo,3014 +nltk/corpus/reader/ipipan.py,sha256=CotFWOLHXLMpStxm0wKVuPGZz8wKg7nUtBvclZ9DgKE,13092 +nltk/corpus/reader/knbc.py,sha256=IZbX42XSRQOK7DhYoz4bXThQgtpH7W_mreMVAdkTbhQ,5787 +nltk/corpus/reader/lin.py,sha256=AXCDCR842JRFJCItEqqVjlDG5zjulO4pQTMp9gwbQxc,6654 +nltk/corpus/reader/markdown.py,sha256=Y9AeB3F1gbUuEK7HNjR6iwHDAYeneI5XH1rMkw_ubWQ,12028 +nltk/corpus/reader/mte.py,sha256=8p5iQIJOuxu9MOWa1xH1Iijj9J9KmcRUWh2vmcWSOqQ,14385 +nltk/corpus/reader/nkjp.py,sha256=9fzhBenK-oaxyyTS6Rbu7Rtx2bFvbT78ZyKh5_BvkAo,16332 +nltk/corpus/reader/nombank.py,sha256=lf7KI3QjubpqiYP-KygZ7TOp1usgOnJ_qpJvUVf388A,16247 +nltk/corpus/reader/nps_chat.py,sha256=KORqpkv_3NWo1p6PtmgyzfJij1I70hrdhCSZeOKkaoU,2940 +nltk/corpus/reader/opinion_lexicon.py,sha256=kh1s8X3yUIqwoejrOHxRoxs4zWijBhenQ3NGIBKkTew,4230 +nltk/corpus/reader/panlex_lite.py,sha256=X-wJ-ZNQwlYK4teoD-0kENwsVLmBPRiWZer2TZE6htY,5440 +nltk/corpus/reader/panlex_swadesh.py,sha256=VRkJSyZuxIldE1wh_a4zro6IkVc1QuV_MVszkeqBDGw,3287 +nltk/corpus/reader/pl196x.py,sha256=WcUrrLWN5STClB00S5ZxYaMmb1mldkOTUz1zZoxI-IY,12320 +nltk/corpus/reader/plaintext.py,sha256=7wNaSNhzBOuXAOdReZC9pkmbVsrEYB0o7CtKiGxcXuE,8456 +nltk/corpus/reader/ppattach.py,sha256=-IKIY--x9eUsrgeGQmjLU1TRbJbNG0SA6Z_14_ZJgTc,2903 +nltk/corpus/reader/propbank.py,sha256=J1NuKo2x6PtBKYc12cGyB12_9tK3ctLzTony94xA5ys,17776 +nltk/corpus/reader/pros_cons.py,sha256=UU_afhN0EkAG8bAUnluLm1csnUwxzlUEqg8AAnLE4DU,4896 +nltk/corpus/reader/reviews.py,sha256=oqWy9piVlwCloLrlI3maFWBZA6pc5PVb1eIMzudKDBA,12321 +nltk/corpus/reader/rte.py,sha256=Yd5yEjw4BO6mBmkviju4HA-PET_aJpG0hyDsijPTx24,4785 +nltk/corpus/reader/semcor.py,sha256=FjJZrOi2lf4wQcYmh_sLnoRTN7sS9CHcOYEbKgV7cyM,11694 +nltk/corpus/reader/senseval.py,sha256=84Eh-kitdfiI_kQoYivPL0lZcYTZmoP_jj_JnsQ_EwA,7539 +nltk/corpus/reader/sentiwordnet.py,sha256=ewnyU2Gh_1aqU3Z3j1-j_QienkJghZdUKuHIuOHklvs,4626 +nltk/corpus/reader/sinica_treebank.py,sha256=ohVwi9lcG1Gb9E6Isl_iiemnZyrUn-KYtdCcmCYjszk,2541 +nltk/corpus/reader/string_category.py,sha256=Dia2lk6Eh7PogHGyzJyZIi3lFSZojO9cpCVnh_1g1vs,1919 +nltk/corpus/reader/switchboard.py,sha256=xBVfJRnQp_XGcVFm70ft46qvZLaKfagGTe-uhSORjlc,4547 +nltk/corpus/reader/tagged.py,sha256=9BkTAHur8ucNuNF61glxyrB01L1gOBJdOeN0VGS6aa8,12140 +nltk/corpus/reader/timit.py,sha256=FpbCiufjEoKPZNgGjvUO1-dAhTq36qeCauaFIzqzRO8,18473 +nltk/corpus/reader/toolbox.py,sha256=hyAISZ70vZ0S1a9Qs6jZ-fa7gw40p_yoodXH3xLwg6s,2121 +nltk/corpus/reader/twitter.py,sha256=IEk9BNX06Z9dKxy6MEM61N8r68Kz-KdLgGib9l-hQn0,4608 +nltk/corpus/reader/udhr.py,sha256=tjqXc1JQiTURHnsTU8RIzwxdQIZasbtdMSPz4msnHVo,2592 +nltk/corpus/reader/util.py,sha256=1Ou0yxGtYjPghnkjZwDMPf38rJ15yDZK-6IA44rhTTs,32225 
+nltk/corpus/reader/verbnet.py,sha256=4fun39GzP3O-cP42ZyQxNQIrer4VQ2Brk3x-5d9VIsk,25404 +nltk/corpus/reader/wordlist.py,sha256=opMPlNhreYExp6X_gfKDb0anIe7LoQGiEFn-yf5ybp4,5812 +nltk/corpus/reader/wordnet.py,sha256=sXfx_7uGRsVR-4TKY9fcpeLaHk-X-AcQqY6zOeeTX7I,93379 +nltk/corpus/reader/xmldocs.py,sha256=RazN7XWIRonA4cyhJIkPOIiFmuX0qMch8lKYFrofDjQ,16285 +nltk/corpus/reader/ycoe.py,sha256=9VbkO_JnFG2joiWfjsfYZ53vsSPl8lWfK00faIAaLN4,10504 +nltk/draw/__init__.py,sha256=vV9gZt9NCushZ8-2N2vqEukajTMiElnEqeZKL_-gXhQ,810 +nltk/draw/cfg.py,sha256=VzwqBY91F0EA3fGS1xSCMMc4PACkcLqqp7AmqrlBxD0,30794 +nltk/draw/dispersion.py,sha256=V_J207xoZ51B4MCS-c7c1IIFP2mgk3tM9b23mw7T8wU,1854 +nltk/draw/table.py,sha256=YLBqgWTHbzCGt0KiVsMeyN2Ikp1hc0LMJ_LwcMwQ9Rs,46257 +nltk/draw/tree.py,sha256=pizGy_aCBJP2lMaqwxj4mLXtfrWlZIBVAbO6PR12TKc,39275 +nltk/draw/util.py,sha256=X2jcM9ISQ0XoJjbat5yOxbaIIyFcOKTDvvOkFWOnGFg,90944 +nltk/inference/__init__.py,sha256=9_R9E3qWRp9Xngl1_SreRCqKSJxzstV9dRgGm35dxeQ,814 +nltk/inference/api.py,sha256=GdomkZQT97b7Z_HER__KuhCDehRbT1jxD7MoV-1COnY,19560 +nltk/inference/discourse.py,sha256=XojyiwkuvpTjBOZCcQ0q5CBpIMOw5fXw3eNzjZPJoqw,22691 +nltk/inference/mace.py,sha256=EKRZuwCrX320jzWrq3NOFnfIugapuYdgiH31dVf2ZvE,12243 +nltk/inference/nonmonotonic.py,sha256=o3mnSO8D3KE_uPlUz6KQi6LAAU0mf2wSxhTRGrqtojA,19174 +nltk/inference/prover9.py,sha256=Xhleh-zd-sa2AqeFtO_tT0MWbl7XazrF7Z5MAhZTTWg,16266 +nltk/inference/resolution.py,sha256=IUb_tYvevriFPfmMwWi_tTNw4AANT5l6gFg_WrawD90,26761 +nltk/inference/tableau.py,sha256=-fzKyG0biJXA8wVNtBkBcg4YolCTyO9Hz38ZxU4yF4U,26320 +nltk/lm/__init__.py,sha256=521PU3HbL6kXH2MkQcgv9gJu2WBAXiaTxtuphcXpJGc,8051 +nltk/lm/api.py,sha256=mxYxQ9ATm6t388mIOEIXrDN69eoysgovs5h6WddMC8s,8495 +nltk/lm/counter.py,sha256=tKEAIt2HyUIovqdC3WqZYAGIHN25hsMf_2qnJrGBB5E,5250 +nltk/lm/models.py,sha256=d9HfqoDGEp0ewWj032LnmSYW6sFjYFZESTaMU93rI4I,4903 +nltk/lm/preprocessing.py,sha256=UTmTQiv9i2f88XAHyyJ-xoM0lOuS3x3ebo8RBkeAZcQ,1714 +nltk/lm/smoothing.py,sha256=J3C3WnYMPNgPtt128O8iqgre6u_vQjaxDqcLzRMnr_Y,4745 +nltk/lm/util.py,sha256=Ap8EJo79SpS1nmdjl4gYa3rZl1cvRWl7lEyNcgGG85Q,474 +nltk/lm/vocabulary.py,sha256=iFpZC_HgU_TUdmr8CbkC3dDaOA_faUrGesrKX7sNGtY,7099 +nltk/metrics/__init__.py,sha256=f_dzvVUT8mHgn-mF-emEmYuvend0lsTn0wibaK4zBaA,1243 +nltk/metrics/agreement.py,sha256=cUblpWaXsxQiceYXyzs_c9xnupKr-Ujtlp8w8cCGAhE,16421 +nltk/metrics/aline.py,sha256=53MBMrgaEot-bVkwGh0412CmGS9ZIJn4Ijr65dMRV6A,32845 +nltk/metrics/association.py,sha256=OlLUrFM_mCzgaoo8MVc6ymDZ3jAv5cGCFe9yAteq4aQ,16569 +nltk/metrics/confusionmatrix.py,sha256=YEUnqMXuoSd9bbnqTT9Dx7UE4bzu3rjg_avyPogd8GM,13035 +nltk/metrics/distance.py,sha256=_SahvONb7y076LDBg1f5I-97ceuEgDkiwhPTTaY6vYI,17661 +nltk/metrics/paice.py,sha256=QA3lWpIojKdQ9nJP_hfm09fiQbvKH8shoRYPqLLT_Go,14739 +nltk/metrics/scores.py,sha256=i8ZpEAN78nGpmE-lAswKVTY-CyXxlgkMrwEZD1kRhIg,7922 +nltk/metrics/segmentation.py,sha256=0BPUZJp1PISVRhUnwujhlIplb8mcv7RAiaf5Da1FUsE,7221 +nltk/metrics/spearman.py,sha256=xcWuGVDPwiAgmKP4kcTje8Q186z8edtznClWGwjhuT0,2197 +nltk/misc/__init__.py,sha256=9gir9D-vScmgbh-EK-prbtkJ19z6IvKAwsa9zSGakTs,406 +nltk/misc/babelfish.py,sha256=9UkSa6l_j1BHxaT9CU1Viv_51RB0k9AA--3ytmQpsAk,361 +nltk/misc/chomsky.py,sha256=UatgZu7Zj3W5DQzuxHZW6Zyxv5kH9l_W2u_ZnOCqXcc,5319 +nltk/misc/minimalset.py,sha256=w31Jhn71QMvzpglkc1z-aqICcB3D0JyEEd94BfiMRDc,2979 +nltk/misc/sort.py,sha256=X5Uyuh06xtHwnEBk1R5NPF0MAJZUFQxz0F3Mjl-R79Q,4547 +nltk/misc/wordfinder.py,sha256=QsIvlL-6SRcbIK1KcRx0dE9f1LZrRrNOZN8nnvRUUQY,4352 
+nltk/parse/__init__.py,sha256=Wk7FRZKwkzOXJ4-wxG5ebPZiervEQUCFIoFxLg4V7eo,3797 +nltk/parse/api.py,sha256=zgidZCV_5T5HxOMU0heuAtMc2TY2zY99yQgE4yU8hA4,2354 +nltk/parse/bllip.py,sha256=gVXcccYuz2x0jGhmQ3WzW1mE2yUEktQCceXULuIlnHs,10976 +nltk/parse/chart.py,sha256=bcickcPKBQcy1ugfIgAQGFJXe_pvYg1cODQHRhpB8yE,63760 +nltk/parse/corenlp.py,sha256=-vLVg1koWGvJtQ7MjbigllvhKOTbwMtk4Yg7EVNGCQk,27745 +nltk/parse/dependencygraph.py,sha256=vQxklNSAXKoPrNY_mne6-SDE6HL1tpACcpmbdGp0Pa4,32468 +nltk/parse/earleychart.py,sha256=E2xNIYwg5uS250A7vEY6pg5CsmN2lG-iRTiey9Mcc0g,18274 +nltk/parse/evaluate.py,sha256=kkKxaqcXgpqtB5T1ByE7VNEqsXqLf4egOFzBic8jMpc,4468 +nltk/parse/featurechart.py,sha256=u4akETKx7QdUR835-SHUIx-CydDhMK0tzjgfrR4CU2I,22532 +nltk/parse/generate.py,sha256=4haxXNHbCSsdeF22fTZxRlNWnQZkIk2BMTDY6FVdqTo,2381 +nltk/parse/malt.py,sha256=yzBztqRi_0t9H-vbYiqFqoujcSnFYaYXBzchYpkv7Ro,16571 +nltk/parse/nonprojectivedependencyparser.py,sha256=U2wwivZa9-xdsneyuT1z98VhCRTaW-3GG9bSpeg0mwU,29446 +nltk/parse/pchart.py,sha256=uJ798hhFYiyHA4upmPA-4cZLdfGKcafV4ppEW_s0boE,20480 +nltk/parse/projectivedependencyparser.py,sha256=thEULJ6bVv1NBx9Di0ZIUpdk1O0rkg05vP0wHBIlWMY,28243 +nltk/parse/recursivedescent.py,sha256=5Y5wG76JmeI6_7rct04SE5MGYWLI1es94E7hNUsGvz8,26032 +nltk/parse/shiftreduce.py,sha256=_bNgIDqa6KSA7VZOaKrDFLThwZ2GBG98UuAq2Nf5bOI,17071 +nltk/parse/stanford.py,sha256=T6Iy6EtAjWK2RzCErUytvF8TBS3BdgtezuVrS4aJJZs,19312 +nltk/parse/transitionparser.py,sha256=G7FgqoDnge7HssR8t_FlbUJfGcVnbvLU4wjmqgmuBew,32272 +nltk/parse/util.py,sha256=uiUuipkOhOrp5HoAyrfGa6BSqVH7eP3GFDJyq54j6dI,8667 +nltk/parse/viterbi.py,sha256=f5hYaHqULHfaHwNrpXZgmX6ME3PBK3eyMY3kuJTA3dE,18351 +nltk/sem/__init__.py,sha256=p6Hbjz2ZPXuflzE8pkU4rWC1QWyKmNXEBKBRieGGeGY,2443 +nltk/sem/boxer.py,sha256=Y-3NmB2tMBBMNHuK1KGxAadHtvUFe0U6fnQhmvWvusw,55287 +nltk/sem/chat80.py,sha256=Guq_aG6zu7ADoA_WMIHWgGpLBSf0yUAEiOjJGF4VLMs,26519 +nltk/sem/cooper_storage.py,sha256=FNR_T2rsGS-ZcxOnQ2ZPyqC6i5FTu-g7RHlWrgSSvhg,4210 +nltk/sem/drt.py,sha256=Ld-l56GCSW0hbjlsaHFTdgdWOu1G-kcwgaL2vP1htdU,53149 +nltk/sem/drt_glue_demo.py,sha256=aeXWCbNS271u6CAH1PRJCvZdSE3hDwM97DeSd7u59JM,19171 +nltk/sem/evaluate.py,sha256=mHeYJqE0BVqPxSwUfkeGAQ1bh9Y5LRk7w9_f7439GcQ,26282 +nltk/sem/glue.py,sha256=ipOnduCYGZ9H2YjnVNPsvbfxy3pDWNufn33dcp4IQeQ,30254 +nltk/sem/hole.py,sha256=toM_3hX9_TPl8yBOssXb3-RaFnxnok6pqvUQSumYCUc,14216 +nltk/sem/lfg.py,sha256=ESvmusXvd9y5tC9BdKyUM_5FooHM4w28SK0YRzaP49M,7716 +nltk/sem/linearlogic.py,sha256=Thc42mvfC9NMvsZonxk7KbL-He8LCFsl30E9SxO0U20,17234 +nltk/sem/logic.py,sha256=FjMyF4FzvdW1cdmkSM_-sqysopFvOA_Mgrfy7Jochdo,70239 +nltk/sem/relextract.py,sha256=melBHw9hbMLiOBoZXa1RI1Ojxy84bNDIP94z6_c2Q7g,15809 +nltk/sem/skolemize.py,sha256=q5IT0Z7lL9sj8fT9PguEwVtOLAvm-J4e36Iw_Wd6z1E,5870 +nltk/sem/util.py,sha256=FcqkK4o7oSt8fXNp4E0ha707veWvQdZFMWlnlbNZeUQ,9062 +nltk/sentiment/__init__.py,sha256=bqru6k9E9lRynVySxEDC7conFrVuPOufcmoHySqm9W4,382 +nltk/sentiment/sentiment_analyzer.py,sha256=ClK4n3ws8r8mrI9ZkMeKHIgm_cellsFYz6lCX9UQDtw,10432 +nltk/sentiment/util.py,sha256=szZfIeQhP0J8AYfC4vNjqmRZFGRbHFPsJdO6Zoz8H50,31275 +nltk/sentiment/vader.py,sha256=WJNkeMBvKchqZFkcssgDnoQxsjstvic5kbpPwQ9swG4,21428 +nltk/stem/__init__.py,sha256=5xo4F2TkiPszcNPK3zqE4mirbEfe4AqWj692l2PV5qc,1296 +nltk/stem/api.py,sha256=bOlMmoUdVllph46QpQHx2HzNFKXdKjm9tHHEWEgDtIE,741 +nltk/stem/arlstem.py,sha256=25d5vA5VJDbxDwBYJVeJf1Qytw8ZFYUaxXYXFmu-Qqc,13006 +nltk/stem/arlstem2.py,sha256=JDtsINE-wtQjjYP8LfbL5fM9EhHOgoYfxIyUZ7X0W2E,16535 
+nltk/stem/cistem.py,sha256=NguOtA7aV1XZSRnWZQwqTSguGxuuv1eV0XANupgD424,7259 +nltk/stem/isri.py,sha256=urgXJidrtKJ-UcgF2nNUMUAiTK7XDXr8W3BXuVUu3eI,14990 +nltk/stem/lancaster.py,sha256=44FMZudf9FZCw28YrpCmEj3MIyKNx5Upn_VBOXJc854,12587 +nltk/stem/porter.py,sha256=c5jYrt7IHEpV-neeZp6zHOjLXYh83x0tksE3VP7YkRg,28372 +nltk/stem/regexp.py,sha256=egNNeMRQu_qUXyLkmmIEP95k72E6Hw58W8nqiHNP-4g,1578 +nltk/stem/rslp.py,sha256=ewvKK6D6zhVfUI7mfwp_b37R4ZZTTQnXC4vMPWmnx_I,5511 +nltk/stem/snowball.py,sha256=XisoC_V0UbtbiY3ZbE88UqRDd-5TMGJIegA--jVW8VQ,183890 +nltk/stem/util.py,sha256=SGkZ9WKxcS93IQIFONUiRj4twmqO19gqjDOeNyxFQJU,644 +nltk/stem/wordnet.py,sha256=uc0ghev-DoXaFKMJVeYXBRLMhSRIPUDJjPVx-vNmNSg,1655 +nltk/tag/__init__.py,sha256=BpW6B4qJSE8VsqWt9UDij_gyCE0izoNGOKxsajQy6UI,7298 +nltk/tag/api.py,sha256=UCocDhZufc3R3Q_xxftjfqjeax2jfkCRvN51reIbG9A,14810 +nltk/tag/brill.py,sha256=DO2jEQYkgUUo88NnbPj7g-nyYuVQctxe5ix-lwuNMdo,16829 +nltk/tag/brill_trainer.py,sha256=ba0A2b255xtF3N30iiLWy8oazQycJ3IP-gBY6SMuX2w,27900 +nltk/tag/crf.py,sha256=dxoeV_vdeVCD-UjnqywALEoROLmsXVBxOHfT_DKQB1A,7834 +nltk/tag/hmm.py,sha256=AuOCEIOpF-FHXBP2qCn8-rAuiVeV6JRPQAYf6NO0KJA,50349 +nltk/tag/hunpos.py,sha256=m-osO4GrP0M6hc2oTT7ZIY2y0xavBNMLzZNubjy5aO8,5195 +nltk/tag/mapping.py,sha256=VTdSr4ICvI6O0QwKIljAe8LfDPUNr3oRtDDX0nFikww,4024 +nltk/tag/perceptron.py,sha256=ge04T_6s-qIk2ElRLA43euMSzbaxQur2KxosuLfq_Tg,13425 +nltk/tag/senna.py,sha256=AuLHt5oEuWb5Lo6YzXzGdfIUu_f3BFkXTnDxS4wkmrQ,5903 +nltk/tag/sequential.py,sha256=bro6YLUoOuucHCBbEX6UKOtmtLDsxPMcSlNrObOCMRA,28621 +nltk/tag/stanford.py,sha256=1d3at71jwf8gVy7AmGzn4-i0RB3HyzR2LZP5qUUA91Y,8427 +nltk/tag/tnt.py,sha256=5Hc2JCVCZsaGxP4TsZ5WJg8wTDlQbg_xA3NlQ6wKRdQ,18432 +nltk/tag/util.py,sha256=blYIFguiOZa362hnydekH-jYN5HXGkfW0-fCKd8xiNw,2353 +nltk/tbl/__init__.py,sha256=MPTjTiMXJF2Y7qIu4QtNGJD_84U-okTijuVks7CZUb0,790 +nltk/tbl/api.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +nltk/tbl/demo.py,sha256=fVIVKfJ7KMNAXrJOmhixLNlmwpaIKcwRzssghsC-W4U,15338 +nltk/tbl/erroranalysis.py,sha256=ikxrW1gcyyzY9fEPeJU2wXJzbdQDZPtj5vcTdC13py8,1454 +nltk/tbl/feature.py,sha256=iS0Oztx_ezjEdGk9pKxLTMJXZSue_Hy7tHmq6aCcyB4,9690 +nltk/tbl/rule.py,sha256=moWZl8BU-OVIJpgnte_DyRMXnUTAx5RqWQh_LjSXiMQ,11515 +nltk/tbl/template.py,sha256=PQfVGnrdXUEVlev5c8oZyhNDaWBXhztzj3CT4W8qB_Q,12892 +nltk/test/__init__.py,sha256=4AjtoEMQTi2Bon2vfiOnHg7CaPAYXNmGgar0r2NyFBs,487 +nltk/test/all.py,sha256=Ojl5ZxWh7LV7XsdOyR8kWEOxMk7E4m7QnCUrQoACzQk,819 +nltk/test/bleu.doctest,sha256=2LmQDiyg-BueBx8LbLJanXCs7UZI9zsLe5nurrXBVBs,862 +nltk/test/bnc.doctest,sha256=RGqVzKyC33SbcmJ8DIu5CGAjhB9OZbNCPiQmE86LJzI,2051 +nltk/test/ccg.doctest,sha256=-cJE1rAIPAuCBf9IwIZJUKBVYR1ehYQ7BEx0d_rz6Fc,19786 +nltk/test/ccg_semantics.doctest,sha256=TCI5Ehwy1a4aai123C7MiggvkQyWh6_g8KCRsNYVe08,31066 +nltk/test/chat80.doctest,sha256=aTd81isXYYxwNBuubVS-Q1GCSSL1LP52y9e1RUPKN9c,8735 +nltk/test/childes.doctest,sha256=FcKzuX_RXXHZNhCyU_S2B7xTN46R54mxjYs17bejhN0,9363 +nltk/test/childes_fixt.py,sha256=A04g6D3QytqCatGOYAg6fCXqXT8XyjIPbRSkdG1ys8o,372 +nltk/test/chunk.doctest,sha256=LinetSJqiBaPbfesFybevL0YTh79toUeDd5gNgcGwPU,11511 +nltk/test/classify.doctest,sha256=eZqVib7QvPfhsj6osCAP27kx0-M_qJBj7y1djMEI8a8,7699 +nltk/test/classify_fixt.py,sha256=zUj2OUNhyi2sJNmVdShbQnuhLUieJ7h_AwKmvj1Gnzw,119 +nltk/test/collections.doctest,sha256=4_vVRoz5oloCW2vBO4WcA6wk8XJ_s5wcDvB6wZ2N7PU,622 +nltk/test/collocations.doctest,sha256=_y2c0VNVbEZVbhEzR6_E4hUabgP9E9g1KWoj4RaS3RA,12506 +nltk/test/concordance.doctest,sha256=rPESoW6-ugpgj2LgBYgChcjOWv75x0eu3Y4A2MuZSsQ,3544 
+nltk/test/conftest.py,sha256=apMfe_V5EHXZHCXCMxVqMt3gpHCUwwxR-MxESis9--Q,804 +nltk/test/corpus.doctest,sha256=GURLSCZMaXO4m60bW2UxVGnmaPEvaEqaXbgq3tT7SvY,99206 +nltk/test/crubadan.doctest,sha256=6tPhRHgxV86EgQ8ERpDAXc2ALzcwULkGOrvpuEPZx0U,2060 +nltk/test/data.doctest,sha256=fk1EXhv8iJaDVdCKwd990WkLcjBuY2KKMjlcRhf0L54,13997 +nltk/test/dependency.doctest,sha256=2cSXwAOK7THXgQC0sb-h1xRV_OXZlalvwgNkJu5I8zE,7669 +nltk/test/discourse.doctest,sha256=Vfg4OAAyW8MG79CebO-9UAG9aE2FmHkKLIM5p9Iy3_Y,17923 +nltk/test/drt.doctest,sha256=O07H5I5L-_dO_CXDL1MwPLUNLgoxdAPtb5UuJ9WDBME,20076 +nltk/test/featgram.doctest,sha256=LgXnXjWEUD32xNRM96u1uwwlLWJ5m8x7NXTLD7rtOW0,28870 +nltk/test/featstruct.doctest,sha256=h77AjizdnQNUNWr_vKMlUSTCXA-eboDE88ll51otp2k,38894 +nltk/test/framenet.doctest,sha256=MCyYWa_3NLu_FaVWjEMyiqa1requinJhDJYYrABSzCg,10797 +nltk/test/generate.doctest,sha256=AiyqziIJnGtWWrVh9CYvtE54tvxgN01ykn0uETWtw-0,2050 +nltk/test/gensim.doctest,sha256=3fKuOarl0dxK_uP5vc4cqa3sZqhYPHQb9eJx8O6ACro,5200 +nltk/test/gensim_fixt.py,sha256=2p-4RObBWBU-YzbGPXAj03aPhimK-l_RLNdS_RuKHh4,77 +nltk/test/gluesemantics.doctest,sha256=20XI1C4RjZqVCVqMS-1YgdeJ_Cz3KN-P1qPYG2qXKiM,12705 +nltk/test/gluesemantics_malt.doctest,sha256=rcmxrg6gW7_7t4kLjA7mWMedy_n2DvFg_EGkOk5RYVw,2667 +nltk/test/gluesemantics_malt_fixt.py,sha256=H7YUsT_M5LteTBF9utPjqUP8pybUaW3w2fQdGUPiT3c,232 +nltk/test/grammar.doctest,sha256=_nCSNPbuqgVmCf6os3_yc0fPRYWjuwKrX89kiLAVbLo,1949 +nltk/test/grammartestsuites.doctest,sha256=6XpAsAg16vJelvF_jhzeiM5SlZrktRshTE2VLPKsvPI,3309 +nltk/test/index.doctest,sha256=RkJVKjSOZQDcsmUE7UNetAphh7uAYEf-MJnTXeL9-yE,2701 +nltk/test/inference.doctest,sha256=eMpj--Sqy28Ib6pJhjSAlEiawwjI5mWoyE8NNyn8-Us,18365 +nltk/test/internals.doctest,sha256=2mKzX8ixuJnslzSIoJskOO9EOwYhWJvuarZNisZnzfI,4283 +nltk/test/japanese.doctest,sha256=YK7NcQroMmbipzOs5B6RKZnwgb3Ve2ii82-Tp2lQn0Q,1093 +nltk/test/lm.doctest,sha256=y9WQdrC4YbD9nAlT6xtRF1w1fCp_cWHtOIA8Ef89vds,3951 +nltk/test/logic.doctest,sha256=pytqmHGGE7Ci4LT9epskJQqpONJV5LW4aoG8Tp9GF04,35183 +nltk/test/meteor.doctest,sha256=cfbnHWYgPYXDGY6i_ex6fUGfUFp3yEBpR2A-uwAJ38s,1523 +nltk/test/metrics.doctest,sha256=VmQgjLNH76mQTLkrRMauAKLmpZQCs_lWv9pxr5nBpHA,11283 +nltk/test/misc.doctest,sha256=9_0lJuV70Z-Go6040fK0FDvrntGmq6OnvJ8hc2KIkMg,3464 +nltk/test/nonmonotonic.doctest,sha256=3T2RDpiYNEOMQ8ATSbVIPAGtzox-Dl0H1MtSE-ozhKM,10370 +nltk/test/paice.doctest,sha256=9KOoUsd6O-ACSMbt3Ras4zS-FE55R4jZ6xh1JKopb3c,1273 +nltk/test/parse.doctest,sha256=c3Ub-rTrdmOfARot2lAE9E2gPnF66_1XOMBH2f4A04o,34936 +nltk/test/portuguese_en.doctest,sha256=72pNCFzJLgiRijWkPiBEiCJn9eCiBJnlultXuUVINxU,23121 +nltk/test/portuguese_en_fixt.py,sha256=-66oHXFBbvDKHkwkcOLnpwcn29iUpwUEWrJ-LqSk5FM,130 +nltk/test/probability.doctest,sha256=4utu6TrRmjha2-KaUWIjYSYRhVNhHvrmMLm7luf6ML8,9244 +nltk/test/probability_fixt.py,sha256=avszs9PHMTVYHHcOXA19-EsTYDahH4VPPPMqv1QkGpE,188 +nltk/test/propbank.doctest,sha256=Fb7YeHYbKPrbpWCjMKYGqcCO95igSbv0jQj1ypsOSEI,6694 +nltk/test/relextract.doctest,sha256=E7ATLql5ii9di5oDrqdRiEZQwpQJ1hcGi8I0duMTZ1s,9520 +nltk/test/resolution.doctest,sha256=04ChMU3moeEIJ6LmrnCIxXAYuiL2xpeStafqEzrY53U,8010 +nltk/test/semantics.doctest,sha256=dsqkf26iKaw9ZG1prpqV9oCeEABiUXz2O2vds7OE0ng,25190 +nltk/test/sentiment.doctest,sha256=8jahboQCGiGt7DRtyR0JXc9iEiHIeKRtqFLDWnEbw8k,12229 +nltk/test/sentiwordnet.doctest,sha256=rBGLCjzhyDgjG5266_x2NyznUY4Yh_rdZh0fxlPOOI8,1051 +nltk/test/setup_fixt.py,sha256=IQUyYM-mNaVbfsGFvfOdJc0ymAJ-0u5OAZd2cqRgF0s,912 
+nltk/test/simple.doctest,sha256=YeUQSFpINBfA9XcfPdzws6_q8uIbbNUSd7yRVtLpNDg,2407 +nltk/test/stem.doctest,sha256=6-Dq9kjXDQ4vHAz_z72rD4v8qnPxPMcVTJJsPoQ9Sog,2552 +nltk/test/tag.doctest,sha256=azWCy7FlgyIfW_LtGgnAoR4f571fDVxyF4aCjWwrh0c,34100 +nltk/test/tokenize.doctest,sha256=6gOvhCdGQXhE9F6Z5BsR44dEOxXaz13wB4hsQCwxL58,20353 +nltk/test/toolbox.doctest,sha256=aemchSfvmSpyREYqErRZjBIcEIFmBMfsk03zJnzabY4,10323 +nltk/test/translate.doctest,sha256=PicRIanA66jTesEOc4zoVDLlu91wc8x8ZfN0QK-ZajY,8396 +nltk/test/tree.doctest,sha256=9e3EF3HGQOD2NHXv2Ytp2CDIGh0fj_hyNxUJJ5RKjRQ,47273 +nltk/test/treeprettyprinter.doctest,sha256=v5KbwNElsxuojmbLTKTDCnQqKXIwuI7fLw8VcDEQUaE,9376 +nltk/test/treetransforms.doctest,sha256=nD-odS5_kVEFqohLE_par75I49LQRY82aVbWYDf8d38,5006 +nltk/test/util.doctest,sha256=sHqm0uv3ZuuLPEc4CH6SMAicYpSjG-84bxvWcTKttcs,1058 +nltk/test/wordnet.doctest,sha256=RaFQlg8PeE3FaiHCu9lGUuvjFMULje6IyTELd4ziXG0,30528 +nltk/test/wordnet_lch.doctest,sha256=S1yaEnkJOlCp9kfWICXOwF-TK6lHyoh5p7hWwRPvnmk,2361 +nltk/test/wsd.doctest,sha256=4TGNVYQ2RAeYhq0uN9RRt7S_KF2T0dJb6XC3MkIbX2w,3014 +nltk/test/unit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +nltk/test/unit/test_aline.py,sha256=u9y3D19sJGY_VE9nSOOON5GMYyTIRGVXMyqveeMypyU,1130 +nltk/test/unit/test_bllip.py,sha256=rm3KrL9F6F49WfXDV42GpuPlvbvsXibTbUXU0pWa-pw,1115 +nltk/test/unit/test_brill.py,sha256=ZTDCN3y2mTZ-dLeL0GwvO3yM6LlcTmIQ1LaNMBtXZyE,1024 +nltk/test/unit/test_cfd_mutation.py,sha256=YutOmFGBizpUzZ61l1cDxIegJah2ntuwKL88C49-WBA,1373 +nltk/test/unit/test_cfg2chomsky.py,sha256=g2wKKjmogJpAaRiLxA0_xw4KLwZa4zvjA-gBMt9z1l0,1726 +nltk/test/unit/test_chunk.py,sha256=xo-ItBtJdBsRIt-rX1rYYkV_ufABgPK4e6HlogRTvWg,2219 +nltk/test/unit/test_classify.py,sha256=4Bv5-rDyrjDGdmRHY_qh4Rq_5VdoghzlJEPB_PCIzQo,1337 +nltk/test/unit/test_collocations.py,sha256=vaiBeImr5dCDOhFPAN8Q4CAUVojJTwNHpodolyIymCU,3690 +nltk/test/unit/test_concordance.py,sha256=91x9LT-875n7OOi3it5O1qr1xKdOA1ufZY3KLH10Iaw,4108 +nltk/test/unit/test_corenlp.py,sha256=doMoc3Drnl3fDFxTaZKVPKq-75RTXOGSOqkI9K_74FQ,58632 +nltk/test/unit/test_corpora.py,sha256=46IA2v_oxDRFQQGa6iGTHiTHAPsDZjIovDSG2432Ubc,9923 +nltk/test/unit/test_corpus_views.py,sha256=mIxoCvqWSfInEQkISPwfZvTG6dTxYh7Bx0kCGC6VsoA,1600 +nltk/test/unit/test_data.py,sha256=y1fXWnIylRrff9fBJBUYZ6xw3T6uwMg_6View-jKcas,390 +nltk/test/unit/test_disagreement.py,sha256=e2JIXrNqCg1YTSh6P2lnGs9YN8KmkWcFD-zcZPsNkjk,4461 +nltk/test/unit/test_distance.py,sha256=DIMhkfn2y6WvsiJRyw1y_T5b_4OHI6wG01eEAt8Cd9Q,5839 +nltk/test/unit/test_downloader.py,sha256=QvpnRVehOfLZVJ-iUH8m5mEHG8w4deKxRhF7IOnjAZM,741 +nltk/test/unit/test_freqdist.py,sha256=I6qkc8zleTMeivGWB0NntBVQDx_tVxthWRwcOB-T0i4,210 +nltk/test/unit/test_hmm.py,sha256=bX7fSFd7k89JCr9VNFr1ZAng4m2KkfuTL_M2TNvA1nU,2285 +nltk/test/unit/test_json2csv_corpus.py,sha256=BxcrONmrp2Rsn9oq9C9cBj_rGFfPTQV9bJ5587pT_WY,5888 +nltk/test/unit/test_json_serialization.py,sha256=CfpHkTvY0lF8rMQXQsv_0nSVhDfhxVkqDwTLq26pv5Q,3634 +nltk/test/unit/test_metrics.py,sha256=iK6bLxVi1fVll-2eCmgzE-ubWnQlFeQjP079qdiRP-A,1949 +nltk/test/unit/test_naivebayes.py,sha256=a_tjsQsyvPIsO3mrtmN6knaC9BFwPE7PDNHBSNdhYMc,764 +nltk/test/unit/test_nombank.py,sha256=gIgs6vlEI2NheAh8c6wlJdk6apHmAMmaDZkP8laIvKY,760 +nltk/test/unit/test_pl196x.py,sha256=C41qhbllNBqtVJ9tCFM8mReQqzsdbM7uoMo9hFVHKLg,410 +nltk/test/unit/test_pos_tag.py,sha256=5HkW7hpjZd2270RVSFXECLxXg8jCY2iBViDoDA8O2Qs,2782 +nltk/test/unit/test_ribes.py,sha256=DItkydO5d543kRFYiAebnqudiF2HETHrMAntG3H75jA,5204 
+nltk/test/unit/test_rte_classify.py,sha256=oNGw78oedct_VpwelsMVFb7v3bRFepnQWWbHgKp3GBQ,2765 +nltk/test/unit/test_seekable_unicode_stream_reader.py,sha256=XBxkic2HcfqxfTY0XxBBBRNEo5FQrYYQzkg1vywwUA0,2265 +nltk/test/unit/test_senna.py,sha256=fuLdpQO7kG-12rWpGprIOiH9fwxhv1yseNxKtpcUmss,3712 +nltk/test/unit/test_stem.py,sha256=kjtoZlKkgtCZYX8kxyVQIPf5f6QSPzUCLkCJLDvDWFA,6347 +nltk/test/unit/test_tag.py,sha256=h7YztNxvYcx2177MkQrPqPgYR2gL1sdl9YB3ZMKuciw,535 +nltk/test/unit/test_tgrep.py,sha256=qsr6de4fXwALqnhURZ2cRDgcGAlP6YBSqvNQdjDHUq4,31708 +nltk/test/unit/test_tokenize.py,sha256=9uQx21Vs5Iv5mBmNyiHqTaOmIecSlD1n9jUY9dF1mBM,30921 +nltk/test/unit/test_twitter_auth.py,sha256=bms9DQ07DwEr53IqMr49qGL9ria_1rEf3aA7xt8oR-A,2509 +nltk/test/unit/test_util.py,sha256=UMUTzBJRSSAdFwp7tZkG7gygQ9gHcrk2IEiuq6XvTRA,1888 +nltk/test/unit/test_wordnet.py,sha256=tZCn_lZVJ8POuehMbAIcgV000vCMwXFJUbdhuPEOSmw,9260 +nltk/test/unit/lm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +nltk/test/unit/lm/test_counter.py,sha256=V4Bd8g-ZZsxxCRjS60Dv4w8-YCEUjktmO8LE2c56hhM,3891 +nltk/test/unit/lm/test_models.py,sha256=AP8cTYju5wG0YPBwtgJdctIjl7-pzcG3j0qFNDNdONI,20160 +nltk/test/unit/lm/test_preprocessing.py,sha256=KCspSaDIezeH8_CHB-T-fKEYpsdt-OHyaEe46-sIdlQ,999 +nltk/test/unit/lm/test_vocabulary.py,sha256=aX9HXIn-wn7mK5flojHfnL0652ygxO539LcyKC3shIk,5917 +nltk/test/unit/translate/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +nltk/test/unit/translate/test_bleu.py,sha256=YpYHqzzebFqF8M-x2AjUYeI3YNmRSseJmAf4xoT13XY,15874 +nltk/test/unit/translate/test_gdfa.py,sha256=E4r6o0S2r5rzK7NPFTyDA2a7IAfYzmIhCU4WXB0Wvdo,4770 +nltk/test/unit/translate/test_ibm1.py,sha256=7dGtPK_T9qXGTmB8skOeJ_mNJ2G8VaoYOlqMkhP0fBs,2669 +nltk/test/unit/translate/test_ibm2.py,sha256=fAjggyyMHRzPmpuFdov7dWfwsNM7hK9Z8p_qalCn_lY,3377 +nltk/test/unit/translate/test_ibm3.py,sha256=9PUQNN75ITw_TgzWayxQrMZRNS6pdyNThSw-sTdscP4,4189 +nltk/test/unit/translate/test_ibm4.py,sha256=r1gfJCmXlP0UZhZdrdzc0CcJLZNCn0zrn_BIMH0dVDk,5209 +nltk/test/unit/translate/test_ibm5.py,sha256=20agaTpArhfMcx-Ady0BUXyxayBU_ipPiDTvb8s_1oo,6761 +nltk/test/unit/translate/test_ibm_model.py,sha256=qTMFR4acSkEP5-kta-9B6RymoswEIitV3ljn86riNCo,9676 +nltk/test/unit/translate/test_meteor.py,sha256=sldeMjDkStoMnxBnx1MKDRNBGmcs4Hdu9VmMSzpl1Jo,750 +nltk/test/unit/translate/test_nist.py,sha256=HFfcs5Gq_goyYm-NSqdb_Eet6kClibKNvcr3gdasMmk,1645 +nltk/test/unit/translate/test_stack_decoder.py,sha256=6d9ASzQpnmNq51Aorm8Fr500HF1F8sJ6yph5aZ9zPXk,10000 +nltk/tokenize/__init__.py,sha256=l-h7tQHfTAIdEjUB5VTl0aAXgt84qfxoKMV4iZT1skw,5243 +nltk/tokenize/api.py,sha256=QH0CT0fM0ezdQc5zjP_XqmVk6k-xTf0IahDcaICM2-M,2357 +nltk/tokenize/casual.py,sha256=r3sija_iHN7TX6Nag6zn5fPzuUK41NuBuQbCL3C3Cdk,16101 +nltk/tokenize/destructive.py,sha256=JuoZNTVaRor0RUGTz9s5gpxO3FUHEkshgl4XFsALAPU,9447 +nltk/tokenize/legality_principle.py,sha256=NFSeZrefRtOJh-ZwrwBJnKTzNgU9fnGC9qxWYEhc3bk,6236 +nltk/tokenize/mwe.py,sha256=JE97pMXeHJsE4nJsouUV7yi7-bcyPW0Sq2RQ2Lz0Kf0,4181 +nltk/tokenize/nist.py,sha256=-EXf8gKQOFQmuymmIsslr1RInp4lS20rgXlECRNWbdA,7720 +nltk/tokenize/punkt.py,sha256=-REXxgvg82MdhFHfvULql7oGGZJ88vcRkr6zgmTffBs,68804 +nltk/tokenize/regexp.py,sha256=5vOXADi9dkbviEwx9FJAGRYM9cEEcZJLoRRUVoa05H8,8331 +nltk/tokenize/repp.py,sha256=0c9syu4dcvFjlFt-bwIIlM2uunhtnAc0rcR_QqfjYJ4,8245 +nltk/tokenize/sexpr.py,sha256=-u9VDGHdiGrNX2asJrfu5bJ4c9LEVIw3DyzixeHAx5M,5302 +nltk/tokenize/simple.py,sha256=exh3CZ7vH93YGahyrLvGTtVHhPw8Ga7LNNRHlzShakQ,5379 
+nltk/tokenize/sonority_sequencing.py,sha256=M8lWyuA1W28DDIULvkWrpJKEg0LnpS1y9o279jmMJEA,7739 +nltk/tokenize/stanford.py,sha256=SwoXMlDUYw7IvB11nRlSKRHD4aX0gVQi5JpZkdElXVY,3875 +nltk/tokenize/stanford_segmenter.py,sha256=5bRj56yMxsAQgaADDz0KYQZqA7HwtJZegCV2pKy3UsM,9857 +nltk/tokenize/texttiling.py,sha256=lm8h4a_VWTxsSXv1_QL6yrhLwdPM6Qedxi1e2WJJOMk,16943 +nltk/tokenize/toktok.py,sha256=R1eW8VozEtxuwDYjNVn2eWSGSTDvtTKtASMhDvdrdIk,7679 +nltk/tokenize/treebank.py,sha256=6ztsMlKFcZMUwz14KyL6jCqeLp_e8gXdYytu5G4pYoA,16669 +nltk/tokenize/util.py,sha256=mZmQdCozzDouBogbxYhqpIavLbY0_-JXaYFbbYAr8zA,10339 +nltk/translate/__init__.py,sha256=vQjbwJ4E56tPLPQoxzPcuTxUR8iA3MHpXzmXb-mexNo,1331 +nltk/translate/api.py,sha256=ZLDnGOVh3b_0OGIWltihod7nGoN9oI-COCZuFQWxg7I,11109 +nltk/translate/bleu_score.py,sha256=7vFnl7ssSUWsr-Rw_ReViV4vy3_RVYDx4AB75NmTM7M,30415 +nltk/translate/chrf_score.py,sha256=7dcr9yg9Ylkuidau1ZqD9EAb6d9gBm2N9Reo5fMUgl4,8978 +nltk/translate/gale_church.py,sha256=dBA0hUzl5WVShCSJAjFA3Qm9erxYNrdj_ews1233ahQ,8732 +nltk/translate/gdfa.py,sha256=tAJnKb23tB5RAkjxxDUokgYQb31Qm__XIKAaSXSm-mE,6246 +nltk/translate/gleu_score.py,sha256=jPqoVzKXoFFy82w7hXpFtLrve6hFMTjV-7tf-3V0ArU,8831 +nltk/translate/ibm1.py,sha256=MnR2l9vpkyxCAyqGd6ajMUkVxsp5Q6m8Hz--lLzWVMs,9522 +nltk/translate/ibm2.py,sha256=kaGLTJfIrsy3KYj1-72vFYMNtDD8ptJst73mS4NH2nk,12561 +nltk/translate/ibm3.py,sha256=E9aNT2lKkV0I9wwGCvhZPWXk7FUl5VwqV2qPeDUbTck,14154 +nltk/translate/ibm4.py,sha256=xIuMMAlAa1pgf5QR35kSr70u9GbJv2cB-ob5JrhfN8o,20765 +nltk/translate/ibm5.py,sha256=ZquikT1RlMBxMaGK8pzL1DDTwFdQpsopdH4FalKFj-4,27957 +nltk/translate/ibm_model.py,sha256=1lWN-m170zeEBnk8lSbcRXov7rPKPR9ErZyZFFOznLc,20504 +nltk/translate/meteor_score.py,sha256=lgjS7-a4tQRXE8-LyLGTGdDDVqbz5aR_xifjVO3V-8Q,17301 +nltk/translate/metrics.py,sha256=dcb59gZ_UU_aRgSJL-VVutsnsn7Ymu6pgU3zCEqDK0k,1513 +nltk/translate/nist_score.py,sha256=Oqx3_3sMO3_MAk28mQB7SNEcmlLMqObbe_gms7eZTJ0,8148 +nltk/translate/phrase_based.py,sha256=YUKogL5G-sTDxGWdSZ7ctAPpNgLi5othueI8c6lJGwI,7860 +nltk/translate/ribes_score.py,sha256=ya_gCcTw7pWnvDg5_YF5uT3bjoWMoN5QJgYsYq5a36k,14027 +nltk/translate/stack_decoder.py,sha256=pukP5rWH0iTbyWMbl3rhaxo0yBWneL5EDyimSxJThN0,20516 +nltk/tree/__init__.py,sha256=DWOXd4zjM8yCZQBIy6lL1QzNhfQ1Ur_pjLaklsOm8bY,1466 +nltk/tree/immutable.py,sha256=6qS9n496OgokWPbcLW87hMpzq_B7EAMnAos3CaisKVQ,4178 +nltk/tree/parented.py,sha256=B_Fmd3KrzJjjFGGsDXk36GVQd-yTtaJEFnLfcJ7vu0A,23192 +nltk/tree/parsing.py,sha256=ujRP9xFUbmxSEFsTB7G58uYzJRZ0JezBeKEr4okDUzg,2083 +nltk/tree/prettyprinter.py,sha256=92lRKwkLJz6ocEA8QakrTATI3wKPaYTN1ypXY6TCLJM,25586 +nltk/tree/probabilistic.py,sha256=ozvFqkExopuAtcR0VJXHlsrelCxNUBSWTRhJo3517BA,2492 +nltk/tree/transforms.py,sha256=PmprKuO_0pYsSxQPdhnNnJsRmZNOtDYkImwZMoOVfvY,13689 +nltk/tree/tree.py,sha256=M-g91zua8IulCAyC-Ur7JetPGrJ1Qmjf1qtPP9cwRkA,36500 +nltk/twitter/__init__.py,sha256=rOQ4kYsVnrT6tvVBGauh5RQdNVCRZA8cC693K34tnuU,819 +nltk/twitter/api.py,sha256=RG3OWvtHhmKqhslO1ugOZuVKaEbmd2_ymSBSM2pQKtw,4692 +nltk/twitter/common.py,sha256=RT0IuDPn8FSwiECDJurJ_GYbvVMJdFuGdhCSbwgzCjg,10120 +nltk/twitter/twitter_demo.py,sha256=oJMUXKn-Oh4ivzeHIBgDT8Uw5Bv_4pfSvS64-idpvjc,8309 +nltk/twitter/twitterclient.py,sha256=q9wjPcRC0_9iVpMJJ9ks83mBqYidjHC6GS9oLjK_4mk,19927 +nltk/twitter/util.py,sha256=tAX06opl6f5tkQevUCQrWSuJPryN8uX0wQzXrPP3onc,4546 +nltk-3.8.dist-info/AUTHORS.md,sha256=lwegiKq14iCouEfpgu85VSAWadP2X1MkLhUsgYBfPOI,7628 +nltk-3.8.dist-info/LICENSE.txt,sha256=Pd-b5cKP4n2tFDpdx27qJSIq0d1ok0oEcGTlbtL6QMU,11560 
+nltk-3.8.dist-info/METADATA,sha256=5uTwnZUdosOCnF_3sl0R50-HLsI0Gz3QpotDQZad3Tg,2788 +nltk-3.8.dist-info/README.md,sha256=kH44ngAu4f_oXUJogH0JutBHNlSkuKWN3ypNtDwXn8E,1783 +nltk-3.8.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92 +nltk-3.8.dist-info/entry_points.txt,sha256=SK6SzMicwUtBiwUOmv5P1ZVs0h-xqey6PnRpsUGGx5c,37 +nltk-3.8.dist-info/top_level.txt,sha256=YoQ-mwqckmTv1Qktmlk5Ylb6lDG77jg5qwoEB7c-pXo,5 +nltk-3.8.dist-info/RECORD,, diff --git a/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/WHEEL b/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/WHEEL new file mode 100644 index 0000000000000000000000000000000000000000..5bad85fdc1cd08553756d0fb2c7be8b5ad6af7fb --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/WHEEL @@ -0,0 +1,5 @@ +Wheel-Version: 1.0 +Generator: bdist_wheel (0.37.0) +Root-Is-Purelib: true +Tag: py3-none-any + diff --git a/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/entry_points.txt b/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/entry_points.txt new file mode 100644 index 0000000000000000000000000000000000000000..65a3a33374385d4199dda32bb7a3284f52653e66 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/entry_points.txt @@ -0,0 +1,3 @@ + +[console_scripts] +nltk=nltk.cli:cli diff --git a/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/requires.txt b/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/requires.txt new file mode 100644 index 0000000000000000000000000000000000000000..cfd6fa88204b2856d1716bec28635de5b51504a7 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/requires.txt @@ -0,0 +1,32 @@ +click +joblib +regex>=2021.8.3 +tqdm + +[all] +numpy +python-crfsuite +scikit-learn +pyparsing +scipy +twython +matplotlib +requests + +[corenlp] +requests + +[machine-learning] +numpy +python-crfsuite +scikit-learn +scipy + +[plot] +matplotlib + +[tgrep] +pyparsing + +[twitter] +twython diff --git a/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/top_level.txt b/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..846929621767476398799cbeff31ac8ae954578f --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/EGG-INFO/top_level.txt @@ -0,0 +1 @@ +nltk diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/VERSION b/.eggs/nltk-3.8-py3.10.egg/nltk/VERSION new file mode 100644 index 0000000000000000000000000000000000000000..9b4df3bda6abb040ceea6f686acc355b52d6acc8 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/VERSION @@ -0,0 +1 @@ +3.8 diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c1262d93d23ff244220c286a4379ecf583b4b543 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/__init__.py @@ -0,0 +1,208 @@ +# Natural Language Toolkit (NLTK) +# +# Copyright (C) 2001-2022 NLTK Project +# Authors: Steven Bird +# Edward Loper +# URL: +# For license information, see LICENSE.TXT + +""" +The Natural Language Toolkit (NLTK) is an open source Python library +for Natural Language Processing. A free online book is available. +(If you use the library for academic research, please cite the book.) + +Steven Bird, Ewan Klein, and Edward Loper (2009). +Natural Language Processing with Python. O'Reilly Media Inc. +https://www.nltk.org/book/ + +isort:skip_file +""" + +import os + +# ////////////////////////////////////////////////////// +# Metadata +# ////////////////////////////////////////////////////// + +# Version. For each new release, the version number should be updated +# in the file VERSION. +try: + # If a VERSION file exists, use it! 
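+    # (The VERSION file ships next to this __init__.py; see nltk/VERSION in
+    # this same egg, which contains the single line "3.8". Reading it keeps
+    # __version__ in sync with the packaged release.)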
+ version_file = os.path.join(os.path.dirname(__file__), "VERSION") + with open(version_file) as infile: + __version__ = infile.read().strip() +except NameError: + __version__ = "unknown (running code interactively?)" +except OSError as ex: + __version__ = "unknown (%s)" % ex + +if __doc__ is not None: # fix for the ``python -OO`` + __doc__ += "\n@version: " + __version__ + + +# Copyright notice +__copyright__ = """\ +Copyright (C) 2001-2022 NLTK Project. + +Distributed and Licensed under the Apache License, Version 2.0, +which is included by reference. +""" + +__license__ = "Apache License, Version 2.0" +# Description of the toolkit, keywords, and the project's primary URL. +__longdescr__ = """\ +The Natural Language Toolkit (NLTK) is a Python package for +natural language processing. NLTK requires Python 3.7, 3.8, 3.9 or 3.10.""" +__keywords__ = [ + "NLP", + "CL", + "natural language processing", + "computational linguistics", + "parsing", + "tagging", + "tokenizing", + "syntax", + "linguistics", + "language", + "natural language", + "text analytics", +] +__url__ = "https://www.nltk.org/" + +# Maintainer, contributors, etc. +__maintainer__ = "NLTK Team" +__maintainer_email__ = "nltk.team@gmail.com" +__author__ = __maintainer__ +__author_email__ = __maintainer_email__ + +# "Trove" classifiers for Python Package Index. +__classifiers__ = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Information Technology", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Scientific/Engineering :: Human Machine Interfaces", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Text Processing", + "Topic :: Text Processing :: Filters", + "Topic :: Text Processing :: General", + "Topic :: Text Processing :: Indexing", + "Topic :: Text Processing :: Linguistic", +] + +from nltk.internals import config_java + +# support numpy from pypy +try: + import numpypy +except ImportError: + pass + +# Override missing methods on environments where it cannot be used like GAE. 
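+# (On sandboxed platforms such as Google App Engine, the subprocess module
+# exists but lacks PIPE and Popen; the stubs below defer the failure to
+# call time with a clear NotImplementedError, instead of an AttributeError
+# at whatever point they are first referenced.)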
+import subprocess + +if not hasattr(subprocess, "PIPE"): + + def _fake_PIPE(*args, **kwargs): + raise NotImplementedError("subprocess.PIPE is not supported.") + + subprocess.PIPE = _fake_PIPE +if not hasattr(subprocess, "Popen"): + + def _fake_Popen(*args, **kwargs): + raise NotImplementedError("subprocess.Popen is not supported.") + + subprocess.Popen = _fake_Popen + +########################################################### +# TOP-LEVEL MODULES +########################################################### + +# Import top-level functionality into top-level namespace + +from nltk.collocations import * +from nltk.decorators import decorator, memoize +from nltk.featstruct import * +from nltk.grammar import * +from nltk.probability import * +from nltk.text import * +from nltk.util import * +from nltk.jsontags import * + +########################################################### +# PACKAGES +########################################################### + +from nltk.chunk import * +from nltk.classify import * +from nltk.inference import * +from nltk.metrics import * +from nltk.parse import * +from nltk.tag import * +from nltk.tokenize import * +from nltk.translate import * +from nltk.tree import * +from nltk.sem import * +from nltk.stem import * + +# Packages which can be lazily imported +# (a) we don't import * +# (b) they're slow to import or have run-time dependencies +# that can safely fail at run time + +from nltk import lazyimport + +app = lazyimport.LazyModule("app", locals(), globals()) +chat = lazyimport.LazyModule("chat", locals(), globals()) +corpus = lazyimport.LazyModule("corpus", locals(), globals()) +draw = lazyimport.LazyModule("draw", locals(), globals()) +toolbox = lazyimport.LazyModule("toolbox", locals(), globals()) + +# Optional loading + +try: + import numpy +except ImportError: + pass +else: + from nltk import cluster + +from nltk.downloader import download, download_shell + +try: + import tkinter +except ImportError: + pass +else: + try: + from nltk.downloader import download_gui + except RuntimeError as e: + import warnings + + warnings.warn( + "Corpus downloader GUI not loaded " + "(RuntimeError during import: %s)" % str(e) + ) + +# explicitly import all top-level modules (ensuring +# they override the same names inadvertently imported +# from a subpackage) + +from nltk import ccg, chunk, classify, collocations +from nltk import data, featstruct, grammar, help, inference, metrics +from nltk import misc, parse, probability, sem, stem, wsd +from nltk import tag, tbl, text, tokenize, translate, tree, util + + +# FIXME: override any accidentally imported demo, see https://github.com/nltk/nltk/issues/2116 +def demo(): + print("To run the demo code for a module, type nltk.module.demo()") diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/app/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/app/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6b5941d2f419d952201301e4b5885f6795a5146a --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/app/__init__.py @@ -0,0 +1,47 @@ +# Natural Language Toolkit: Applications package +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird +# URL: +# For license information, see LICENSE.TXT + +""" +Interactive NLTK Applications: + +chartparser: Chart Parser +chunkparser: Regular-Expression Chunk Parser +collocations: Find collocations in text +concordance: Part-of-speech concordancer +nemo: Finding (and Replacing) Nemo regular expression tool +rdparser: Recursive Descent Parser +srparser: 
Shift-Reduce Parser +wordnet: WordNet Browser +""" + + +# Import Tkinter-based modules if Tkinter is installed +try: + import tkinter +except ImportError: + import warnings + + warnings.warn("nltk.app package not loaded (please install Tkinter library).") +else: + from nltk.app.chartparser_app import app as chartparser + from nltk.app.chunkparser_app import app as chunkparser + from nltk.app.collocations_app import app as collocations + from nltk.app.concordance_app import app as concordance + from nltk.app.nemo_app import app as nemo + from nltk.app.rdparser_app import app as rdparser + from nltk.app.srparser_app import app as srparser + from nltk.app.wordnet_app import app as wordnet + + try: + from matplotlib import pylab + except ImportError: + import warnings + + warnings.warn("nltk.app.wordfreq not loaded (requires the matplotlib library).") + else: + from nltk.app.wordfreq_app import app as wordfreq diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/app/chartparser_app.py b/.eggs/nltk-3.8-py3.10.egg/nltk/app/chartparser_app.py new file mode 100644 index 0000000000000000000000000000000000000000..738b07a5f06c0be6e72a0d32bdde85cee06fb028 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/app/chartparser_app.py @@ -0,0 +1,2569 @@ +# Natural Language Toolkit: Chart Parser Application +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Jean Mark Gawron +# Steven Bird +# URL: +# For license information, see LICENSE.TXT + +""" +A graphical tool for exploring chart parsing. + +Chart parsing is a flexible parsing algorithm that uses a data +structure called a "chart" to record hypotheses about syntactic +constituents. Each hypothesis is represented by a single "edge" on +the chart. A set of "chart rules" determine when new edges can be +added to the chart. This set of rules controls the overall behavior +of the parser (e.g. whether it parses top-down or bottom-up). + +The chart parsing tool demonstrates the process of parsing a single +sentence, with a given grammar and lexicon. Its display is divided +into three sections: the bottom section displays the chart; the middle +section displays the sentence; and the top section displays the +partial syntax tree corresponding to the selected edge. Buttons along +the bottom of the window are used to control the execution of the +algorithm. + +The chart parsing tool allows for flexible control of the parsing +algorithm. At each step of the algorithm, you can select which rule +or strategy you wish to apply. This allows you to experiment with +mixing different strategies (e.g. top-down and bottom-up). You can +exercise fine-grained control over the algorithm by selecting which +edge you wish to apply a rule to. +""" + +# At some point, we should rewrite this tool to use the new canvas +# widget system. 
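+#
+# A minimal way to drive this tool programmatically (an editor's sketch,
+# not part of the upstream module; it assumes a working Tkinter install,
+# and the toy grammar and sentence are invented for illustration):
+#
+#     >>> from nltk import CFG
+#     >>> from nltk.app.chartparser_app import ChartParserApp
+#     >>> grammar = CFG.fromstring('''
+#     ...     S -> NP VP
+#     ...     NP -> Det N
+#     ...     VP -> V NP
+#     ...     Det -> 'the'
+#     ...     N -> 'dog' | 'cat'
+#     ...     V -> 'chased'
+#     ... ''')
+#     >>> ChartParserApp(grammar, "the dog chased the cat".split()).mainloop()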
+ + +import os.path +import pickle +from tkinter import ( + Button, + Canvas, + Checkbutton, + Frame, + IntVar, + Label, + Menu, + Scrollbar, + Tk, + Toplevel, +) +from tkinter.filedialog import askopenfilename, asksaveasfilename +from tkinter.font import Font +from tkinter.messagebox import showerror, showinfo + +from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment +from nltk.draw.util import ( + CanvasFrame, + ColorizedList, + EntryDialog, + MutableOptionMenu, + ShowText, + SymbolWidget, +) +from nltk.grammar import CFG, Nonterminal +from nltk.parse.chart import ( + BottomUpPredictCombineRule, + BottomUpPredictRule, + Chart, + LeafEdge, + LeafInitRule, + SingleEdgeFundamentalRule, + SteppingChartParser, + TopDownInitRule, + TopDownPredictRule, + TreeEdge, +) +from nltk.tree import Tree +from nltk.util import in_idle + +# Known bug: ChartView doesn't handle edges generated by epsilon +# productions (e.g., [Production: PP -> ]) very well. + +####################################################################### +# Edge List +####################################################################### + + +class EdgeList(ColorizedList): + ARROW = SymbolWidget.SYMBOLS["rightarrow"] + + def _init_colortags(self, textwidget, options): + textwidget.tag_config("terminal", foreground="#006000") + textwidget.tag_config("arrow", font="symbol", underline="0") + textwidget.tag_config("dot", foreground="#000000") + textwidget.tag_config( + "nonterminal", foreground="blue", font=("helvetica", -12, "bold") + ) + + def _item_repr(self, item): + contents = [] + contents.append(("%s\t" % item.lhs(), "nonterminal")) + contents.append((self.ARROW, "arrow")) + for i, elt in enumerate(item.rhs()): + if i == item.dot(): + contents.append((" *", "dot")) + if isinstance(elt, Nonterminal): + contents.append((" %s" % elt.symbol(), "nonterminal")) + else: + contents.append((" %r" % elt, "terminal")) + if item.is_complete(): + contents.append((" *", "dot")) + return contents + + +####################################################################### +# Chart Matrix View +####################################################################### + + +class ChartMatrixView: + """ + A view of a chart that displays the contents of the corresponding matrix. 
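+
+    Rows correspond to the start index of an edge and columns to its end
+    index, so cell (i, j) collects the edges spanning tokens i..j. Cells
+    are shaded by how many edges they contain, and clicking a cell lists
+    its edges below the matrix.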
+ """ + + def __init__( + self, parent, chart, toplevel=True, title="Chart Matrix", show_numedges=False + ): + self._chart = chart + self._cells = [] + self._marks = [] + + self._selected_cell = None + + if toplevel: + self._root = Toplevel(parent) + self._root.title(title) + self._root.bind("", self.destroy) + self._init_quit(self._root) + else: + self._root = Frame(parent) + + self._init_matrix(self._root) + self._init_list(self._root) + if show_numedges: + self._init_numedges(self._root) + else: + self._numedges_label = None + + self._callbacks = {} + + self._num_edges = 0 + + self.draw() + + def _init_quit(self, root): + quit = Button(root, text="Quit", command=self.destroy) + quit.pack(side="bottom", expand=0, fill="none") + + def _init_matrix(self, root): + cframe = Frame(root, border=2, relief="sunken") + cframe.pack(expand=0, fill="none", padx=1, pady=3, side="top") + self._canvas = Canvas(cframe, width=200, height=200, background="white") + self._canvas.pack(expand=0, fill="none") + + def _init_numedges(self, root): + self._numedges_label = Label(root, text="0 edges") + self._numedges_label.pack(expand=0, fill="none", side="top") + + def _init_list(self, root): + self._list = EdgeList(root, [], width=20, height=5) + self._list.pack(side="top", expand=1, fill="both", pady=3) + + def cb(edge, self=self): + self._fire_callbacks("select", edge) + + self._list.add_callback("select", cb) + self._list.focus() + + def destroy(self, *e): + if self._root is None: + return + try: + self._root.destroy() + except: + pass + self._root = None + + def set_chart(self, chart): + if chart is not self._chart: + self._chart = chart + self._num_edges = 0 + self.draw() + + def update(self): + if self._root is None: + return + + # Count the edges in each cell + N = len(self._cells) + cell_edges = [[0 for i in range(N)] for j in range(N)] + for edge in self._chart: + cell_edges[edge.start()][edge.end()] += 1 + + # Color the cells correspondingly. + for i in range(N): + for j in range(i, N): + if cell_edges[i][j] == 0: + color = "gray20" + else: + color = "#00{:02x}{:02x}".format( + min(255, 50 + 128 * cell_edges[i][j] / 10), + max(0, 128 - 128 * cell_edges[i][j] / 10), + ) + cell_tag = self._cells[i][j] + self._canvas.itemconfig(cell_tag, fill=color) + if (i, j) == self._selected_cell: + self._canvas.itemconfig(cell_tag, outline="#00ffff", width=3) + self._canvas.tag_raise(cell_tag) + else: + self._canvas.itemconfig(cell_tag, outline="black", width=1) + + # Update the edge list. + edges = list(self._chart.select(span=self._selected_cell)) + self._list.set(edges) + + # Update our edge count. + self._num_edges = self._chart.num_edges() + if self._numedges_label is not None: + self._numedges_label["text"] = "%d edges" % self._num_edges + + def activate(self): + self._canvas.itemconfig("inactivebox", state="hidden") + self.update() + + def inactivate(self): + self._canvas.itemconfig("inactivebox", state="normal") + self.update() + + def add_callback(self, event, func): + self._callbacks.setdefault(event, {})[func] = 1 + + def remove_callback(self, event, func=None): + if func is None: + del self._callbacks[event] + else: + try: + del self._callbacks[event][func] + except: + pass + + def _fire_callbacks(self, event, *args): + if event not in self._callbacks: + return + for cb_func in list(self._callbacks[event].keys()): + cb_func(*args) + + def select_cell(self, i, j): + if self._root is None: + return + + # If the cell is already selected (and the chart contents + # haven't changed), then do nothing. 
+ if (i, j) == self._selected_cell and self._chart.num_edges() == self._num_edges: + return + + self._selected_cell = (i, j) + self.update() + + # Fire the callback. + self._fire_callbacks("select_cell", i, j) + + def deselect_cell(self): + if self._root is None: + return + self._selected_cell = None + self._list.set([]) + self.update() + + def _click_cell(self, i, j): + if self._selected_cell == (i, j): + self.deselect_cell() + else: + self.select_cell(i, j) + + def view_edge(self, edge): + self.select_cell(*edge.span()) + self._list.view(edge) + + def mark_edge(self, edge): + if self._root is None: + return + self.select_cell(*edge.span()) + self._list.mark(edge) + + def unmark_edge(self, edge=None): + if self._root is None: + return + self._list.unmark(edge) + + def markonly_edge(self, edge): + if self._root is None: + return + self.select_cell(*edge.span()) + self._list.markonly(edge) + + def draw(self): + if self._root is None: + return + LEFT_MARGIN = BOT_MARGIN = 15 + TOP_MARGIN = 5 + c = self._canvas + c.delete("all") + N = self._chart.num_leaves() + 1 + dx = (int(c["width"]) - LEFT_MARGIN) / N + dy = (int(c["height"]) - TOP_MARGIN - BOT_MARGIN) / N + + c.delete("all") + + # Labels and dotted lines + for i in range(N): + c.create_text( + LEFT_MARGIN - 2, i * dy + dy / 2 + TOP_MARGIN, text=repr(i), anchor="e" + ) + c.create_text( + i * dx + dx / 2 + LEFT_MARGIN, + N * dy + TOP_MARGIN + 1, + text=repr(i), + anchor="n", + ) + c.create_line( + LEFT_MARGIN, + dy * (i + 1) + TOP_MARGIN, + dx * N + LEFT_MARGIN, + dy * (i + 1) + TOP_MARGIN, + dash=".", + ) + c.create_line( + dx * i + LEFT_MARGIN, + TOP_MARGIN, + dx * i + LEFT_MARGIN, + dy * N + TOP_MARGIN, + dash=".", + ) + + # A box around the whole thing + c.create_rectangle( + LEFT_MARGIN, TOP_MARGIN, LEFT_MARGIN + dx * N, dy * N + TOP_MARGIN, width=2 + ) + + # Cells + self._cells = [[None for i in range(N)] for j in range(N)] + for i in range(N): + for j in range(i, N): + t = c.create_rectangle( + j * dx + LEFT_MARGIN, + i * dy + TOP_MARGIN, + (j + 1) * dx + LEFT_MARGIN, + (i + 1) * dy + TOP_MARGIN, + fill="gray20", + ) + self._cells[i][j] = t + + def cb(event, self=self, i=i, j=j): + self._click_cell(i, j) + + c.tag_bind(t, "", cb) + + # Inactive box + xmax, ymax = int(c["width"]), int(c["height"]) + t = c.create_rectangle( + -100, + -100, + xmax + 100, + ymax + 100, + fill="gray50", + state="hidden", + tag="inactivebox", + ) + c.tag_lower(t) + + # Update the cells. + self.update() + + def pack(self, *args, **kwargs): + self._root.pack(*args, **kwargs) + + +####################################################################### +# Chart Results View +####################################################################### + + +class ChartResultsView: + def __init__(self, parent, chart, grammar, toplevel=True): + self._chart = chart + self._grammar = grammar + self._trees = [] + self._y = 10 + self._treewidgets = [] + self._selection = None + self._selectbox = None + + if toplevel: + self._root = Toplevel(parent) + self._root.title("Chart Parser Application: Results") + self._root.bind("", self.destroy) + else: + self._root = Frame(parent) + + # Buttons + if toplevel: + buttons = Frame(self._root) + buttons.pack(side="bottom", expand=0, fill="x") + Button(buttons, text="Quit", command=self.destroy).pack(side="right") + Button(buttons, text="Print All", command=self.print_all).pack(side="left") + Button(buttons, text="Print Selection", command=self.print_selection).pack( + side="left" + ) + + # Canvas frame. 
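+        # (closeenough=20 is presumably passed through to the underlying Tk
+        # canvas, letting clicks select a tree widget from up to 20 pixels
+        # away.)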
+ self._cframe = CanvasFrame(self._root, closeenough=20) + self._cframe.pack(side="top", expand=1, fill="both") + + # Initial update + self.update() + + def update(self, edge=None): + if self._root is None: + return + # If the edge isn't a parse edge, do nothing. + if edge is not None: + if edge.lhs() != self._grammar.start(): + return + if edge.span() != (0, self._chart.num_leaves()): + return + + for parse in self._chart.parses(self._grammar.start()): + if parse not in self._trees: + self._add(parse) + + def _add(self, parse): + # Add it to self._trees. + self._trees.append(parse) + + # Create a widget for it. + c = self._cframe.canvas() + treewidget = tree_to_treesegment(c, parse) + + # Add it to the canvas frame. + self._treewidgets.append(treewidget) + self._cframe.add_widget(treewidget, 10, self._y) + + # Register callbacks. + treewidget.bind_click(self._click) + + # Update y. + self._y = treewidget.bbox()[3] + 10 + + def _click(self, widget): + c = self._cframe.canvas() + if self._selection is not None: + c.delete(self._selectbox) + self._selection = widget + (x1, y1, x2, y2) = widget.bbox() + self._selectbox = c.create_rectangle(x1, y1, x2, y2, width=2, outline="#088") + + def _color(self, treewidget, color): + treewidget.label()["color"] = color + for child in treewidget.subtrees(): + if isinstance(child, TreeSegmentWidget): + self._color(child, color) + else: + child["color"] = color + + def print_all(self, *e): + if self._root is None: + return + self._cframe.print_to_file() + + def print_selection(self, *e): + if self._root is None: + return + if self._selection is None: + showerror("Print Error", "No tree selected") + else: + c = self._cframe.canvas() + for widget in self._treewidgets: + if widget is not self._selection: + self._cframe.destroy_widget(widget) + c.delete(self._selectbox) + (x1, y1, x2, y2) = self._selection.bbox() + self._selection.move(10 - x1, 10 - y1) + c["scrollregion"] = f"0 0 {x2 - x1 + 20} {y2 - y1 + 20}" + self._cframe.print_to_file() + + # Restore our state. + self._treewidgets = [self._selection] + self.clear() + self.update() + + def clear(self): + if self._root is None: + return + for treewidget in self._treewidgets: + self._cframe.destroy_widget(treewidget) + self._trees = [] + self._treewidgets = [] + if self._selection is not None: + self._cframe.canvas().delete(self._selectbox) + self._selection = None + self._y = 10 + + def set_chart(self, chart): + self.clear() + self._chart = chart + self.update() + + def set_grammar(self, grammar): + self.clear() + self._grammar = grammar + self.update() + + def destroy(self, *e): + if self._root is None: + return + try: + self._root.destroy() + except: + pass + self._root = None + + def pack(self, *args, **kwargs): + self._root.pack(*args, **kwargs) + + +####################################################################### +# Chart Comparer +####################################################################### + + +class ChartComparer: + """ + + :ivar _root: The root window + + :ivar _charts: A dictionary mapping names to charts. When + charts are loaded, they are added to this dictionary. + + :ivar _left_chart: The left ``Chart``. + :ivar _left_name: The name ``_left_chart`` (derived from filename) + :ivar _left_matrix: The ``ChartMatrixView`` for ``_left_chart`` + :ivar _left_selector: The drop-down ``MutableOptionsMenu`` used + to select ``_left_chart``. + + :ivar _right_chart: The right ``Chart``. 
+ :ivar _right_name: The name ``_right_chart`` (derived from filename) + :ivar _right_matrix: The ``ChartMatrixView`` for ``_right_chart`` + :ivar _right_selector: The drop-down ``MutableOptionsMenu`` used + to select ``_right_chart``. + + :ivar _out_chart: The out ``Chart``. + :ivar _out_name: The name ``_out_chart`` (derived from filename) + :ivar _out_matrix: The ``ChartMatrixView`` for ``_out_chart`` + :ivar _out_label: The label for ``_out_chart``. + + :ivar _op_label: A Label containing the most recent operation. + """ + + _OPSYMBOL = { + "-": "-", + "and": SymbolWidget.SYMBOLS["intersection"], + "or": SymbolWidget.SYMBOLS["union"], + } + + def __init__(self, *chart_filenames): + # This chart is displayed when we don't have a value (eg + # before any chart is loaded). + faketok = [""] * 8 + self._emptychart = Chart(faketok) + + # The left & right charts start out empty. + self._left_name = "None" + self._right_name = "None" + self._left_chart = self._emptychart + self._right_chart = self._emptychart + + # The charts that have been loaded. + self._charts = {"None": self._emptychart} + + # The output chart. + self._out_chart = self._emptychart + + # The most recent operation + self._operator = None + + # Set up the root window. + self._root = Tk() + self._root.title("Chart Comparison") + self._root.bind("", self.destroy) + self._root.bind("", self.destroy) + + # Initialize all widgets, etc. + self._init_menubar(self._root) + self._init_chartviews(self._root) + self._init_divider(self._root) + self._init_buttons(self._root) + self._init_bindings(self._root) + + # Load any specified charts. + for filename in chart_filenames: + self.load_chart(filename) + + def destroy(self, *e): + if self._root is None: + return + try: + self._root.destroy() + except: + pass + self._root = None + + def mainloop(self, *args, **kwargs): + return + self._root.mainloop(*args, **kwargs) + + # //////////////////////////////////////////////////////////// + # Initialization + # //////////////////////////////////////////////////////////// + + def _init_menubar(self, root): + menubar = Menu(root) + + # File menu + filemenu = Menu(menubar, tearoff=0) + filemenu.add_command( + label="Load Chart", + accelerator="Ctrl-o", + underline=0, + command=self.load_chart_dialog, + ) + filemenu.add_command( + label="Save Output", + accelerator="Ctrl-s", + underline=0, + command=self.save_chart_dialog, + ) + filemenu.add_separator() + filemenu.add_command( + label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x" + ) + menubar.add_cascade(label="File", underline=0, menu=filemenu) + + # Compare menu + opmenu = Menu(menubar, tearoff=0) + opmenu.add_command( + label="Intersection", command=self._intersection, accelerator="+" + ) + opmenu.add_command(label="Union", command=self._union, accelerator="*") + opmenu.add_command( + label="Difference", command=self._difference, accelerator="-" + ) + opmenu.add_separator() + opmenu.add_command(label="Swap Charts", command=self._swapcharts) + menubar.add_cascade(label="Compare", underline=0, menu=opmenu) + + # Add the menu + self._root.config(menu=menubar) + + def _init_divider(self, root): + divider = Frame(root, border=2, relief="sunken") + divider.pack(side="top", fill="x", ipady=2) + + def _init_chartviews(self, root): + opfont = ("symbol", -36) # Font for operator. + eqfont = ("helvetica", -36) # Font for equals sign. + + frame = Frame(root, background="#c0c0c0") + frame.pack(side="top", expand=1, fill="both") + + # The left matrix. 
+ cv1_frame = Frame(frame, border=3, relief="groove") + cv1_frame.pack(side="left", padx=8, pady=7, expand=1, fill="both") + self._left_selector = MutableOptionMenu( + cv1_frame, list(self._charts.keys()), command=self._select_left + ) + self._left_selector.pack(side="top", pady=5, fill="x") + self._left_matrix = ChartMatrixView( + cv1_frame, self._emptychart, toplevel=False, show_numedges=True + ) + self._left_matrix.pack(side="bottom", padx=5, pady=5, expand=1, fill="both") + self._left_matrix.add_callback("select", self.select_edge) + self._left_matrix.add_callback("select_cell", self.select_cell) + self._left_matrix.inactivate() + + # The operator. + self._op_label = Label( + frame, text=" ", width=3, background="#c0c0c0", font=opfont + ) + self._op_label.pack(side="left", padx=5, pady=5) + + # The right matrix. + cv2_frame = Frame(frame, border=3, relief="groove") + cv2_frame.pack(side="left", padx=8, pady=7, expand=1, fill="both") + self._right_selector = MutableOptionMenu( + cv2_frame, list(self._charts.keys()), command=self._select_right + ) + self._right_selector.pack(side="top", pady=5, fill="x") + self._right_matrix = ChartMatrixView( + cv2_frame, self._emptychart, toplevel=False, show_numedges=True + ) + self._right_matrix.pack(side="bottom", padx=5, pady=5, expand=1, fill="both") + self._right_matrix.add_callback("select", self.select_edge) + self._right_matrix.add_callback("select_cell", self.select_cell) + self._right_matrix.inactivate() + + # The equals sign + Label(frame, text="=", width=3, background="#c0c0c0", font=eqfont).pack( + side="left", padx=5, pady=5 + ) + + # The output matrix. + out_frame = Frame(frame, border=3, relief="groove") + out_frame.pack(side="left", padx=8, pady=7, expand=1, fill="both") + self._out_label = Label(out_frame, text="Output") + self._out_label.pack(side="top", pady=9) + self._out_matrix = ChartMatrixView( + out_frame, self._emptychart, toplevel=False, show_numedges=True + ) + self._out_matrix.pack(side="bottom", padx=5, pady=5, expand=1, fill="both") + self._out_matrix.add_callback("select", self.select_edge) + self._out_matrix.add_callback("select_cell", self.select_cell) + self._out_matrix.inactivate() + + def _init_buttons(self, root): + buttons = Frame(root) + buttons.pack(side="bottom", pady=5, fill="x", expand=0) + Button(buttons, text="Intersection", command=self._intersection).pack( + side="left" + ) + Button(buttons, text="Union", command=self._union).pack(side="left") + Button(buttons, text="Difference", command=self._difference).pack(side="left") + Frame(buttons, width=20).pack(side="left") + Button(buttons, text="Swap Charts", command=self._swapcharts).pack(side="left") + + Button(buttons, text="Detach Output", command=self._detach_out).pack( + side="right" + ) + + def _init_bindings(self, root): + # root.bind('', self.save_chart) + root.bind("", self.load_chart_dialog) + # root.bind('', self.reset) + + # //////////////////////////////////////////////////////////// + # Input Handling + # //////////////////////////////////////////////////////////// + + def _select_left(self, name): + self._left_name = name + self._left_chart = self._charts[name] + self._left_matrix.set_chart(self._left_chart) + if name == "None": + self._left_matrix.inactivate() + self._apply_op() + + def _select_right(self, name): + self._right_name = name + self._right_chart = self._charts[name] + self._right_matrix.set_chart(self._right_chart) + if name == "None": + self._right_matrix.inactivate() + self._apply_op() + + def _apply_op(self): + if 
self._operator == "-": + self._difference() + elif self._operator == "or": + self._union() + elif self._operator == "and": + self._intersection() + + # //////////////////////////////////////////////////////////// + # File + # //////////////////////////////////////////////////////////// + CHART_FILE_TYPES = [("Pickle file", ".pickle"), ("All files", "*")] + + def save_chart_dialog(self, *args): + filename = asksaveasfilename( + filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle" + ) + if not filename: + return + try: + with open(filename, "wb") as outfile: + pickle.dump(self._out_chart, outfile) + except Exception as e: + showerror("Error Saving Chart", f"Unable to open file: {filename!r}\n{e}") + + def load_chart_dialog(self, *args): + filename = askopenfilename( + filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle" + ) + if not filename: + return + try: + self.load_chart(filename) + except Exception as e: + showerror("Error Loading Chart", f"Unable to open file: {filename!r}\n{e}") + + def load_chart(self, filename): + with open(filename, "rb") as infile: + chart = pickle.load(infile) + name = os.path.basename(filename) + if name.endswith(".pickle"): + name = name[:-7] + if name.endswith(".chart"): + name = name[:-6] + self._charts[name] = chart + self._left_selector.add(name) + self._right_selector.add(name) + + # If either left_matrix or right_matrix is empty, then + # display the new chart. + if self._left_chart is self._emptychart: + self._left_selector.set(name) + elif self._right_chart is self._emptychart: + self._right_selector.set(name) + + def _update_chartviews(self): + self._left_matrix.update() + self._right_matrix.update() + self._out_matrix.update() + + # //////////////////////////////////////////////////////////// + # Selection + # //////////////////////////////////////////////////////////// + + def select_edge(self, edge): + if edge in self._left_chart: + self._left_matrix.markonly_edge(edge) + else: + self._left_matrix.unmark_edge() + if edge in self._right_chart: + self._right_matrix.markonly_edge(edge) + else: + self._right_matrix.unmark_edge() + if edge in self._out_chart: + self._out_matrix.markonly_edge(edge) + else: + self._out_matrix.unmark_edge() + + def select_cell(self, i, j): + self._left_matrix.select_cell(i, j) + self._right_matrix.select_cell(i, j) + self._out_matrix.select_cell(i, j) + + # //////////////////////////////////////////////////////////// + # Operations + # //////////////////////////////////////////////////////////// + + def _difference(self): + if not self._checkcompat(): + return + + out_chart = Chart(self._left_chart.tokens()) + for edge in self._left_chart: + if edge not in self._right_chart: + out_chart.insert(edge, []) + + self._update("-", out_chart) + + def _intersection(self): + if not self._checkcompat(): + return + + out_chart = Chart(self._left_chart.tokens()) + for edge in self._left_chart: + if edge in self._right_chart: + out_chart.insert(edge, []) + + self._update("and", out_chart) + + def _union(self): + if not self._checkcompat(): + return + + out_chart = Chart(self._left_chart.tokens()) + for edge in self._left_chart: + out_chart.insert(edge, []) + for edge in self._right_chart: + out_chart.insert(edge, []) + + self._update("or", out_chart) + + def _swapcharts(self): + left, right = self._left_name, self._right_name + self._left_selector.set(right) + self._right_selector.set(left) + + def _checkcompat(self): + if ( + self._left_chart.tokens() != self._right_chart.tokens() + or self._left_chart.property_names() 
!= self._right_chart.property_names() + or self._left_chart == self._emptychart + or self._right_chart == self._emptychart + ): + # Clear & inactivate the output chart. + self._out_chart = self._emptychart + self._out_matrix.set_chart(self._out_chart) + self._out_matrix.inactivate() + self._out_label["text"] = "Output" + # Issue some other warning? + return False + else: + return True + + def _update(self, operator, out_chart): + self._operator = operator + self._op_label["text"] = self._OPSYMBOL[operator] + self._out_chart = out_chart + self._out_matrix.set_chart(out_chart) + self._out_label["text"] = "{} {} {}".format( + self._left_name, + self._operator, + self._right_name, + ) + + def _clear_out_chart(self): + self._out_chart = self._emptychart + self._out_matrix.set_chart(self._out_chart) + self._op_label["text"] = " " + self._out_matrix.inactivate() + + def _detach_out(self): + ChartMatrixView(self._root, self._out_chart, title=self._out_label["text"]) + + +####################################################################### +# Chart View +####################################################################### + + +class ChartView: + """ + A component for viewing charts. This is used by ``ChartParserApp`` to + allow students to interactively experiment with various chart + parsing techniques. It is also used by ``Chart.draw()``. + + :ivar _chart: The chart that we are giving a view of. This chart + may be modified; after it is modified, you should call + ``update``. + :ivar _sentence: The list of tokens that the chart spans. + + :ivar _root: The root window. + :ivar _chart_canvas: The canvas we're using to display the chart + itself. + :ivar _tree_canvas: The canvas we're using to display the tree + that each edge spans. May be None, if we're not displaying + trees. + :ivar _sentence_canvas: The canvas we're using to display the sentence + text. May be None, if we're not displaying the sentence text. + :ivar _edgetags: A dictionary mapping from edges to the tags of + the canvas elements (lines, etc) used to display that edge. + The values of this dictionary have the form + ``(linetag, rhstag1, dottag, rhstag2, lhstag)``. + :ivar _treetags: A list of all the tags that make up the tree; + used to erase the tree (without erasing the loclines). + :ivar _chart_height: The height of the chart canvas. + :ivar _sentence_height: The height of the sentence canvas. + :ivar _tree_height: The height of the tree + + :ivar _text_height: The height of a text string (in the normal + font). + + :ivar _edgelevels: A list of edges at each level of the chart (the + top level is the 0th element). This list is used to remember + where edges should be drawn; and to make sure that no edges + are overlapping on the chart view. + + :ivar _unitsize: Pixel size of one unit (from the location). This + is determined by the span of the chart's location, and the + width of the chart display canvas. + + :ivar _fontsize: The current font size + + :ivar _marks: A dictionary from edges to marks. Marks are + strings, specifying colors (e.g. 'green'). + """ + + _LEAF_SPACING = 10 + _MARGIN = 10 + _TREE_LEVEL_SIZE = 12 + _CHART_LEVEL_SIZE = 40 + + def __init__(self, chart, root=None, **kw): + """ + Construct a new ``Chart`` display. + """ + # Process keyword args. + draw_tree = kw.get("draw_tree", 0) + draw_sentence = kw.get("draw_sentence", 1) + self._fontsize = kw.get("fontsize", -12) + + # The chart! 
+ self._chart = chart + + # Callback functions + self._callbacks = {} + + # Keep track of drawn edges + self._edgelevels = [] + self._edgetags = {} + + # Keep track of which edges are marked. + self._marks = {} + + # These are used to keep track of the set of tree tokens + # currently displayed in the tree canvas. + self._treetoks = [] + self._treetoks_edge = None + self._treetoks_index = 0 + + # Keep track of the tags used to draw the tree + self._tree_tags = [] + + # Put multiple edges on each level? + self._compact = 0 + + # If they didn't provide a main window, then set one up. + if root is None: + top = Tk() + top.title("Chart View") + + def destroy1(e, top=top): + top.destroy() + + def destroy2(top=top): + top.destroy() + + top.bind("q", destroy1) + b = Button(top, text="Done", command=destroy2) + b.pack(side="bottom") + self._root = top + else: + self._root = root + + # Create some fonts. + self._init_fonts(root) + + # Create the chart canvas. + (self._chart_sb, self._chart_canvas) = self._sb_canvas(self._root) + self._chart_canvas["height"] = 300 + self._chart_canvas["closeenough"] = 15 + + # Create the sentence canvas. + if draw_sentence: + cframe = Frame(self._root, relief="sunk", border=2) + cframe.pack(fill="both", side="bottom") + self._sentence_canvas = Canvas(cframe, height=50) + self._sentence_canvas["background"] = "#e0e0e0" + self._sentence_canvas.pack(fill="both") + # self._sentence_canvas['height'] = self._sentence_height + else: + self._sentence_canvas = None + + # Create the tree canvas. + if draw_tree: + (sb, canvas) = self._sb_canvas(self._root, "n", "x") + (self._tree_sb, self._tree_canvas) = (sb, canvas) + self._tree_canvas["height"] = 200 + else: + self._tree_canvas = None + + # Do some analysis to figure out how big the window should be + self._analyze() + self.draw() + self._resize() + self._grow() + + # Set up the configure callback, which will be called whenever + # the window is resized. + self._chart_canvas.bind("", self._configure) + + def _init_fonts(self, root): + self._boldfont = Font(family="helvetica", weight="bold", size=self._fontsize) + self._font = Font(family="helvetica", size=self._fontsize) + # See: + self._sysfont = Font(font=Button()["font"]) + root.option_add("*Font", self._sysfont) + + def _sb_canvas(self, root, expand="y", fill="both", side="bottom"): + """ + Helper for __init__: construct a canvas with a scrollbar. + """ + cframe = Frame(root, relief="sunk", border=2) + cframe.pack(fill=fill, expand=expand, side=side) + canvas = Canvas(cframe, background="#e0e0e0") + + # Give the canvas a scrollbar. + sb = Scrollbar(cframe, orient="vertical") + sb.pack(side="right", fill="y") + canvas.pack(side="left", fill=fill, expand="yes") + + # Connect the scrollbars to the canvas. + sb["command"] = canvas.yview + canvas["yscrollcommand"] = sb.set + + return (sb, canvas) + + def scroll_up(self, *e): + self._chart_canvas.yview("scroll", -1, "units") + + def scroll_down(self, *e): + self._chart_canvas.yview("scroll", 1, "units") + + def page_up(self, *e): + self._chart_canvas.yview("scroll", -1, "pages") + + def page_down(self, *e): + self._chart_canvas.yview("scroll", 1, "pages") + + def _grow(self): + """ + Grow the window, if necessary + """ + # Grow, if need-be + N = self._chart.num_leaves() + width = max( + int(self._chart_canvas["width"]), N * self._unitsize + ChartView._MARGIN * 2 + ) + + # It won't resize without the second (height) line, but I + # don't understand why not. 
+        self._chart_canvas.configure(width=width)
+        self._chart_canvas.configure(height=self._chart_canvas["height"])
+
+        self._unitsize = (width - 2 * ChartView._MARGIN) / N
+
+        # Reset the height for the sentence window.
+        if self._sentence_canvas is not None:
+            self._sentence_canvas["height"] = self._sentence_height
+
+    def set_font_size(self, size):
+        self._font.configure(size=-abs(size))
+        self._boldfont.configure(size=-abs(size))
+        self._sysfont.configure(size=-abs(size))
+        self._analyze()
+        self._grow()
+        self.draw()
+
+    def get_font_size(self):
+        return abs(self._fontsize)
+
+    def _configure(self, e):
+        """
+        The configure callback. This is called whenever the window is
+        resized. It is also called when the window is first mapped.
+        It figures out the unit size, and redraws the contents of each
+        canvas.
+        """
+        N = self._chart.num_leaves()
+        self._unitsize = (e.width - 2 * ChartView._MARGIN) / N
+        self.draw()
+
+    def update(self, chart=None):
+        """
+        Draw any edges that have not been drawn. This is typically
+        called after the chart that this ``ChartView`` is displaying
+        has been modified. ``update`` will cause any edges that have
+        been added to the chart to be drawn.
+
+        If update is given a ``chart`` argument, then it will replace
+        the current chart with the given chart.
+        """
+        if chart is not None:
+            self._chart = chart
+            self._edgelevels = []
+            self._marks = {}
+            self._analyze()
+            self._grow()
+            self.draw()
+            self.erase_tree()
+            self._resize()
+        else:
+            for edge in self._chart:
+                if edge not in self._edgetags:
+                    self._add_edge(edge)
+            self._resize()
+
+    def _edge_conflict(self, edge, lvl):
+        """
+        Return True if the given edge overlaps with any edge on the given
+        level. This is used by _add_edge to figure out what level a
+        new edge should be added to.
+        """
+        (s1, e1) = edge.span()
+        for otheredge in self._edgelevels[lvl]:
+            (s2, e2) = otheredge.span()
+            if (s1 <= s2 < e1) or (s2 <= s1 < e2) or (s1 == s2 == e1 == e2):
+                return True
+        return False
+
+    def _analyze_edge(self, edge):
+        """
+        Given a new edge, recalculate:
+
+        - _text_height
+        - _unitsize (if the edge text is too big for the current
+          _unitsize, then increase _unitsize)
+        """
+        c = self._chart_canvas
+
+        if isinstance(edge, TreeEdge):
+            lhs = edge.lhs()
+            rhselts = []
+            for elt in edge.rhs():
+                if isinstance(elt, Nonterminal):
+                    rhselts.append(str(elt.symbol()))
+                else:
+                    rhselts.append(repr(elt))
+            rhs = " ".join(rhselts)
+        else:
+            lhs = edge.lhs()
+            rhs = ""
+
+        for s in (lhs, rhs):
+            tag = c.create_text(
+                0, 0, text=s, font=self._boldfont, anchor="nw", justify="left"
+            )
+            bbox = c.bbox(tag)
+            c.delete(tag)
+            width = bbox[2]  # + ChartView._LEAF_SPACING
+            edgelen = max(edge.length(), 1)
+            self._unitsize = max(self._unitsize, width / edgelen)
+            self._text_height = max(self._text_height, bbox[3] - bbox[1])
+
+    def _add_edge(self, edge, minlvl=0):
+        """
+        Add a single edge to the ChartView:
+
+        - Call _analyze_edge to recalculate display parameters
+        - Find an available level
+        - Call _draw_edge
+        """
+        # Do NOT show leaf edges in the chart.
+        if isinstance(edge, LeafEdge):
+            return
+
+        if edge in self._edgetags:
+            return
+        self._analyze_edge(edge)
+        self._grow()
+
+        if not self._compact:
+            self._edgelevels.append([edge])
+            lvl = len(self._edgelevels) - 1
+            self._draw_edge(edge, lvl)
+            self._resize()
+            return
+
+        # Figure out what level to draw the edge on.
+        lvl = 0
+        while True:
+            # If this level doesn't exist yet, create it.
+ while lvl >= len(self._edgelevels): + self._edgelevels.append([]) + self._resize() + + # Check if we can fit the edge in this level. + if lvl >= minlvl and not self._edge_conflict(edge, lvl): + # Go ahead and draw it. + self._edgelevels[lvl].append(edge) + break + + # Try the next level. + lvl += 1 + + self._draw_edge(edge, lvl) + + def view_edge(self, edge): + level = None + for i in range(len(self._edgelevels)): + if edge in self._edgelevels[i]: + level = i + break + if level is None: + return + # Try to view the new edge.. + y = (level + 1) * self._chart_level_size + dy = self._text_height + 10 + self._chart_canvas.yview("moveto", 1.0) + if self._chart_height != 0: + self._chart_canvas.yview("moveto", (y - dy) / self._chart_height) + + def _draw_edge(self, edge, lvl): + """ + Draw a single edge on the ChartView. + """ + c = self._chart_canvas + + # Draw the arrow. + x1 = edge.start() * self._unitsize + ChartView._MARGIN + x2 = edge.end() * self._unitsize + ChartView._MARGIN + if x2 == x1: + x2 += max(4, self._unitsize / 5) + y = (lvl + 1) * self._chart_level_size + linetag = c.create_line(x1, y, x2, y, arrow="last", width=3) + + # Draw a label for the edge. + if isinstance(edge, TreeEdge): + rhs = [] + for elt in edge.rhs(): + if isinstance(elt, Nonterminal): + rhs.append(str(elt.symbol())) + else: + rhs.append(repr(elt)) + pos = edge.dot() + else: + rhs = [] + pos = 0 + + rhs1 = " ".join(rhs[:pos]) + rhs2 = " ".join(rhs[pos:]) + rhstag1 = c.create_text(x1 + 3, y, text=rhs1, font=self._font, anchor="nw") + dotx = c.bbox(rhstag1)[2] + 6 + doty = (c.bbox(rhstag1)[1] + c.bbox(rhstag1)[3]) / 2 + dottag = c.create_oval(dotx - 2, doty - 2, dotx + 2, doty + 2) + rhstag2 = c.create_text(dotx + 6, y, text=rhs2, font=self._font, anchor="nw") + lhstag = c.create_text( + (x1 + x2) / 2, y, text=str(edge.lhs()), anchor="s", font=self._boldfont + ) + + # Keep track of the edge's tags. + self._edgetags[edge] = (linetag, rhstag1, dottag, rhstag2, lhstag) + + # Register a callback for clicking on the edge. + def cb(event, self=self, edge=edge): + self._fire_callbacks("select", edge) + + c.tag_bind(rhstag1, "", cb) + c.tag_bind(rhstag2, "", cb) + c.tag_bind(linetag, "", cb) + c.tag_bind(dottag, "", cb) + c.tag_bind(lhstag, "", cb) + + self._color_edge(edge) + + def _color_edge(self, edge, linecolor=None, textcolor=None): + """ + Color in an edge with the given colors. + If no colors are specified, use intelligent defaults + (dependent on selection, etc.) 
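+
+        The defaults, applied below: a complete edge that spans the whole
+        chart (a full parse) is drawn green, a leaf edge slate blue, and
+        any other edge plain blue; a marked edge keeps its mark color.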
+ """ + if edge not in self._edgetags: + return + c = self._chart_canvas + + if linecolor is not None and textcolor is not None: + if edge in self._marks: + linecolor = self._marks[edge] + tags = self._edgetags[edge] + c.itemconfig(tags[0], fill=linecolor) + c.itemconfig(tags[1], fill=textcolor) + c.itemconfig(tags[2], fill=textcolor, outline=textcolor) + c.itemconfig(tags[3], fill=textcolor) + c.itemconfig(tags[4], fill=textcolor) + return + else: + N = self._chart.num_leaves() + if edge in self._marks: + self._color_edge(self._marks[edge]) + if edge.is_complete() and edge.span() == (0, N): + self._color_edge(edge, "#084", "#042") + elif isinstance(edge, LeafEdge): + self._color_edge(edge, "#48c", "#246") + else: + self._color_edge(edge, "#00f", "#008") + + def mark_edge(self, edge, mark="#0df"): + """ + Mark an edge + """ + self._marks[edge] = mark + self._color_edge(edge) + + def unmark_edge(self, edge=None): + """ + Unmark an edge (or all edges) + """ + if edge is None: + old_marked_edges = list(self._marks.keys()) + self._marks = {} + for edge in old_marked_edges: + self._color_edge(edge) + else: + del self._marks[edge] + self._color_edge(edge) + + def markonly_edge(self, edge, mark="#0df"): + self.unmark_edge() + self.mark_edge(edge, mark) + + def _analyze(self): + """ + Analyze the sentence string, to figure out how big a unit needs + to be, How big the tree should be, etc. + """ + # Figure out the text height and the unit size. + unitsize = 70 # min unitsize + text_height = 0 + c = self._chart_canvas + + # Check against all tokens + for leaf in self._chart.leaves(): + tag = c.create_text( + 0, 0, text=repr(leaf), font=self._font, anchor="nw", justify="left" + ) + bbox = c.bbox(tag) + c.delete(tag) + width = bbox[2] + ChartView._LEAF_SPACING + unitsize = max(width, unitsize) + text_height = max(text_height, bbox[3] - bbox[1]) + + self._unitsize = unitsize + self._text_height = text_height + self._sentence_height = self._text_height + 2 * ChartView._MARGIN + + # Check against edges. + for edge in self._chart.edges(): + self._analyze_edge(edge) + + # Size of chart levels + self._chart_level_size = self._text_height * 2 + + # Default tree size.. + self._tree_height = 3 * (ChartView._TREE_LEVEL_SIZE + self._text_height) + + # Resize the scrollregions. + self._resize() + + def _resize(self): + """ + Update the scroll-regions for each canvas. This ensures that + everything is within a scroll-region, so the user can use the + scrollbars to view the entire display. This does *not* + resize the window. + """ + c = self._chart_canvas + + # Reset the chart scroll region + width = self._chart.num_leaves() * self._unitsize + ChartView._MARGIN * 2 + + levels = len(self._edgelevels) + self._chart_height = (levels + 2) * self._chart_level_size + c["scrollregion"] = (0, 0, width, self._chart_height) + + # Reset the tree scroll region + if self._tree_canvas: + self._tree_canvas["scrollregion"] = (0, 0, width, self._tree_height) + + def _draw_loclines(self): + """ + Draw location lines. These are vertical gridlines used to + show where each location unit is. 
+ """ + BOTTOM = 50000 + c1 = self._tree_canvas + c2 = self._sentence_canvas + c3 = self._chart_canvas + margin = ChartView._MARGIN + self._loclines = [] + for i in range(0, self._chart.num_leaves() + 1): + x = i * self._unitsize + margin + + if c1: + t1 = c1.create_line(x, 0, x, BOTTOM) + c1.tag_lower(t1) + if c2: + t2 = c2.create_line(x, 0, x, self._sentence_height) + c2.tag_lower(t2) + t3 = c3.create_line(x, 0, x, BOTTOM) + c3.tag_lower(t3) + t4 = c3.create_text(x + 2, 0, text=repr(i), anchor="nw", font=self._font) + c3.tag_lower(t4) + # if i % 4 == 0: + # if c1: c1.itemconfig(t1, width=2, fill='gray60') + # if c2: c2.itemconfig(t2, width=2, fill='gray60') + # c3.itemconfig(t3, width=2, fill='gray60') + if i % 2 == 0: + if c1: + c1.itemconfig(t1, fill="gray60") + if c2: + c2.itemconfig(t2, fill="gray60") + c3.itemconfig(t3, fill="gray60") + else: + if c1: + c1.itemconfig(t1, fill="gray80") + if c2: + c2.itemconfig(t2, fill="gray80") + c3.itemconfig(t3, fill="gray80") + + def _draw_sentence(self): + """Draw the sentence string.""" + if self._chart.num_leaves() == 0: + return + c = self._sentence_canvas + margin = ChartView._MARGIN + y = ChartView._MARGIN + + for i, leaf in enumerate(self._chart.leaves()): + x1 = i * self._unitsize + margin + x2 = x1 + self._unitsize + x = (x1 + x2) / 2 + tag = c.create_text( + x, y, text=repr(leaf), font=self._font, anchor="n", justify="left" + ) + bbox = c.bbox(tag) + rt = c.create_rectangle( + x1 + 2, + bbox[1] - (ChartView._LEAF_SPACING / 2), + x2 - 2, + bbox[3] + (ChartView._LEAF_SPACING / 2), + fill="#f0f0f0", + outline="#f0f0f0", + ) + c.tag_lower(rt) + + def erase_tree(self): + for tag in self._tree_tags: + self._tree_canvas.delete(tag) + self._treetoks = [] + self._treetoks_edge = None + self._treetoks_index = 0 + + def draw_tree(self, edge=None): + if edge is None and self._treetoks_edge is None: + return + if edge is None: + edge = self._treetoks_edge + + # If it's a new edge, then get a new list of treetoks. + if self._treetoks_edge != edge: + self._treetoks = [t for t in self._chart.trees(edge) if isinstance(t, Tree)] + self._treetoks_edge = edge + self._treetoks_index = 0 + + # Make sure there's something to draw. + if len(self._treetoks) == 0: + return + + # Erase the old tree. + for tag in self._tree_tags: + self._tree_canvas.delete(tag) + + # Draw the new tree. + tree = self._treetoks[self._treetoks_index] + self._draw_treetok(tree, edge.start()) + + # Show how many trees are available for the edge. + self._draw_treecycle() + + # Update the scroll region. + w = self._chart.num_leaves() * self._unitsize + 2 * ChartView._MARGIN + h = tree.height() * (ChartView._TREE_LEVEL_SIZE + self._text_height) + self._tree_canvas["scrollregion"] = (0, 0, w, h) + + def cycle_tree(self): + self._treetoks_index = (self._treetoks_index + 1) % len(self._treetoks) + self.draw_tree(self._treetoks_edge) + + def _draw_treecycle(self): + if len(self._treetoks) <= 1: + return + + # Draw the label. + label = "%d Trees" % len(self._treetoks) + c = self._tree_canvas + margin = ChartView._MARGIN + right = self._chart.num_leaves() * self._unitsize + margin - 2 + tag = c.create_text(right, 2, anchor="ne", text=label, font=self._boldfont) + self._tree_tags.append(tag) + _, _, _, y = c.bbox(tag) + + # Draw the triangles. 
+ for i in range(len(self._treetoks)): + x = right - 20 * (len(self._treetoks) - i - 1) + if i == self._treetoks_index: + fill = "#084" + else: + fill = "#fff" + tag = c.create_polygon( + x, y + 10, x - 5, y, x - 10, y + 10, fill=fill, outline="black" + ) + self._tree_tags.append(tag) + + # Set up a callback: show the tree if they click on its + # triangle. + def cb(event, self=self, i=i): + self._treetoks_index = i + self.draw_tree() + + c.tag_bind(tag, "", cb) + + def _draw_treetok(self, treetok, index, depth=0): + """ + :param index: The index of the first leaf in the tree. + :return: The index of the first leaf after the tree. + """ + c = self._tree_canvas + margin = ChartView._MARGIN + + # Draw the children + child_xs = [] + for child in treetok: + if isinstance(child, Tree): + child_x, index = self._draw_treetok(child, index, depth + 1) + child_xs.append(child_x) + else: + child_xs.append((2 * index + 1) * self._unitsize / 2 + margin) + index += 1 + + # If we have children, then get the node's x by averaging their + # node x's. Otherwise, make room for ourselves. + if child_xs: + nodex = sum(child_xs) / len(child_xs) + else: + # [XX] breaks for null productions. + nodex = (2 * index + 1) * self._unitsize / 2 + margin + index += 1 + + # Draw the node + nodey = depth * (ChartView._TREE_LEVEL_SIZE + self._text_height) + tag = c.create_text( + nodex, + nodey, + anchor="n", + justify="center", + text=str(treetok.label()), + fill="#042", + font=self._boldfont, + ) + self._tree_tags.append(tag) + + # Draw lines to the children. + childy = nodey + ChartView._TREE_LEVEL_SIZE + self._text_height + for childx, child in zip(child_xs, treetok): + if isinstance(child, Tree) and child: + # A "real" tree token: + tag = c.create_line( + nodex, + nodey + self._text_height, + childx, + childy, + width=2, + fill="#084", + ) + self._tree_tags.append(tag) + if isinstance(child, Tree) and not child: + # An unexpanded tree token: + tag = c.create_line( + nodex, + nodey + self._text_height, + childx, + childy, + width=2, + fill="#048", + dash="2 3", + ) + self._tree_tags.append(tag) + if not isinstance(child, Tree): + # A leaf: + tag = c.create_line( + nodex, + nodey + self._text_height, + childx, + 10000, + width=2, + fill="#084", + ) + self._tree_tags.append(tag) + + return nodex, index + + def draw(self): + """ + Draw everything (from scratch). + """ + if self._tree_canvas: + self._tree_canvas.delete("all") + self.draw_tree() + + if self._sentence_canvas: + self._sentence_canvas.delete("all") + self._draw_sentence() + + self._chart_canvas.delete("all") + self._edgetags = {} + + # Redraw any edges we erased. + for lvl in range(len(self._edgelevels)): + for edge in self._edgelevels[lvl]: + self._draw_edge(edge, lvl) + + for edge in self._chart: + self._add_edge(edge) + + self._draw_loclines() + + def add_callback(self, event, func): + self._callbacks.setdefault(event, {})[func] = 1 + + def remove_callback(self, event, func=None): + if func is None: + del self._callbacks[event] + else: + try: + del self._callbacks[event][func] + except: + pass + + def _fire_callbacks(self, event, *args): + if event not in self._callbacks: + return + for cb_func in list(self._callbacks[event].keys()): + cb_func(*args) + + +####################################################################### +# Edge Rules +####################################################################### +# These version of the chart rules only apply to a specific edge. +# This lets the user select an edge, and then apply a rule. 
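+# Mechanically, EdgeRule (defined below) pins the user-selected edge as the
+# last argument to the wrapped rule's apply() and decrements NUM_EDGES by
+# one, so the stepping parser only needs to supply the remaining edges.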
+
+
+class EdgeRule:
+    """
+    To create an edge rule, make an empty base class that uses
+    EdgeRule as the first base class, and the basic rule as the
+    second base class.  (Order matters!)
+    """
+
+    def __init__(self, edge):
+        super = self.__class__.__bases__[1]
+        self._edge = edge
+        self.NUM_EDGES = super.NUM_EDGES - 1
+
+    def apply(self, chart, grammar, *edges):
+        super = self.__class__.__bases__[1]
+        edges += (self._edge,)
+        yield from super.apply(self, chart, grammar, *edges)
+
+    def __str__(self):
+        super = self.__class__.__bases__[1]
+        return super.__str__(self)
+
+
+class TopDownPredictEdgeRule(EdgeRule, TopDownPredictRule):
+    pass
+
+
+class BottomUpEdgeRule(EdgeRule, BottomUpPredictRule):
+    pass
+
+
+class BottomUpLeftCornerEdgeRule(EdgeRule, BottomUpPredictCombineRule):
+    pass
+
+
+class FundamentalEdgeRule(EdgeRule, SingleEdgeFundamentalRule):
+    pass
+
+
+#######################################################################
+# Chart Parser Application
+#######################################################################
+
+
+class ChartParserApp:
+    def __init__(self, grammar, tokens, title="Chart Parser Application"):
+        # Initialize the parser
+        self._init_parser(grammar, tokens)
+
+        self._root = None
+        try:
+            # Create the root window.
+            self._root = Tk()
+            self._root.title(title)
+            self._root.bind("<Control-q>", self.destroy)
+
+            # Set up some frames.
+            frame3 = Frame(self._root)
+            frame2 = Frame(self._root)
+            frame1 = Frame(self._root)
+            frame3.pack(side="bottom", fill="none")
+            frame2.pack(side="bottom", fill="x")
+            frame1.pack(side="bottom", fill="both", expand=1)
+
+            self._init_fonts(self._root)
+            self._init_animation()
+            self._init_chartview(frame1)
+            self._init_rulelabel(frame2)
+            self._init_buttons(frame3)
+            self._init_menubar()
+
+            self._matrix = None
+            self._results = None
+
+            # Set up keyboard bindings.
+            self._init_bindings()
+
+        except:
+            print("Error creating Tree View")
+            self.destroy()
+            raise
+
+    def destroy(self, *args):
+        if self._root is None:
+            return
+        self._root.destroy()
+        self._root = None
+
+    def mainloop(self, *args, **kwargs):
+        """
+        Enter the Tkinter mainloop.  This function must be called if
+        this demo is created from a non-interactive program (e.g.
+        from a script); otherwise, the demo will close as soon as
+        the script completes.
+        """
+        if in_idle():
+            return
+        self._root.mainloop(*args, **kwargs)
+
+    # ////////////////////////////////////////////////////////////
+    # Initialization Helpers
+    # ////////////////////////////////////////////////////////////
+
+    def _init_parser(self, grammar, tokens):
+        self._grammar = grammar
+        self._tokens = tokens
+        self._reset_parser()
+
+    def _reset_parser(self):
+        self._cp = SteppingChartParser(self._grammar)
+        self._cp.initialize(self._tokens)
+        self._chart = self._cp.chart()
+
+        # Insert LeafEdges before the parsing starts.
+        for _new_edge in LeafInitRule().apply(self._chart, self._grammar):
+            pass
+
+        # The step iterator -- use this to generate new edges
+        self._cpstep = self._cp.step()
+
+        # The currently selected edge
+        self._selection = None
+
+    def _init_fonts(self, root):
+        # See:
+        self._sysfont = Font(font=Button()["font"])
+        root.option_add("*Font", self._sysfont)
+
+        # What's our font size (default=same as sysfont)
+        self._size = IntVar(root)
+        self._size.set(self._sysfont.cget("size"))
+
+        self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get())
+        self._font = Font(family="helvetica", size=self._size.get())
+
+    def _init_animation(self):
+        # Are we stepping?  (default=yes)
+        self._step = IntVar(self._root)
+        self._step.set(1)
+
+        # What's our animation speed (default=fast)
+        self._animate = IntVar(self._root)
+        self._animate.set(3)  # Default speed = fast
+
+        # Are we currently animating?
+        self._animating = 0
+
+    def _init_chartview(self, parent):
+        self._cv = ChartView(self._chart, parent, draw_tree=1, draw_sentence=1)
+        self._cv.add_callback("select", self._click_cv_edge)
+
+    def _init_rulelabel(self, parent):
+        ruletxt = "Last edge generated by:"
+
+        self._rulelabel1 = Label(parent, text=ruletxt, font=self._boldfont)
+        self._rulelabel2 = Label(
+            parent, width=40, relief="groove", anchor="w", font=self._boldfont
+        )
+        self._rulelabel1.pack(side="left")
+        self._rulelabel2.pack(side="left")
+        step = Checkbutton(parent, variable=self._step, text="Step")
+        step.pack(side="right")
+
+    def _init_buttons(self, parent):
+        frame1 = Frame(parent)
+        frame2 = Frame(parent)
+        frame1.pack(side="bottom", fill="x")
+        frame2.pack(side="top", fill="none")
+
+        Button(
+            frame1,
+            text="Reset\nParser",
+            background="#90c0d0",
+            foreground="black",
+            command=self.reset,
+        ).pack(side="right")
+        # Button(frame1, text='Pause',
+        #        background='#90c0d0', foreground='black',
+        #        command=self.pause).pack(side='left')
+
+        Button(
+            frame1,
+            text="Top Down\nStrategy",
+            background="#90c0d0",
+            foreground="black",
+            command=self.top_down_strategy,
+        ).pack(side="left")
+        Button(
+            frame1,
+            text="Bottom Up\nStrategy",
+            background="#90c0d0",
+            foreground="black",
+            command=self.bottom_up_strategy,
+        ).pack(side="left")
+        Button(
+            frame1,
+            text="Bottom Up\nLeft-Corner Strategy",
+            background="#90c0d0",
+            foreground="black",
+            command=self.bottom_up_leftcorner_strategy,
+        ).pack(side="left")
+
+        Button(
+            frame2,
+            text="Top Down Init\nRule",
+            background="#90f090",
+            foreground="black",
+            command=self.top_down_init,
+        ).pack(side="left")
+        Button(
+            frame2,
+            text="Top Down Predict\nRule",
+            background="#90f090",
+            foreground="black",
+            command=self.top_down_predict,
+        ).pack(side="left")
+        Frame(frame2, width=20).pack(side="left")
+
+        Button(
+            frame2,
+            text="Bottom Up Predict\nRule",
+            background="#90f090",
+            foreground="black",
+            command=self.bottom_up,
+        ).pack(side="left")
+        Frame(frame2, width=20).pack(side="left")
+
+        Button(
+            frame2,
+            text="Bottom Up Left-Corner\nPredict Rule",
+            background="#90f090",
+            foreground="black",
+            command=self.bottom_up_leftcorner,
+        ).pack(side="left")
+        Frame(frame2, width=20).pack(side="left")
+
+        Button(
+            frame2,
+            text="Fundamental\nRule",
+            background="#90f090",
+            foreground="black",
+            command=self.fundamental,
+        ).pack(side="left")
+
+    def _init_bindings(self):
+        self._root.bind("<Up>", self._cv.scroll_up)
+        self._root.bind("<Down>", self._cv.scroll_down)
+        self._root.bind("<Prior>", self._cv.page_up)
+        self._root.bind("<Next>", self._cv.page_down)
+        self._root.bind("<Control-q>", self.destroy)
+        self._root.bind("<Control-x>", self.destroy)
+        self._root.bind("<F1>", self.help)
+
+        self._root.bind("<Control-s>", self.save_chart)
+        self._root.bind("<Control-o>", self.load_chart)
+        self._root.bind("<Control-r>", self.reset)
+
+        self._root.bind("t", self.top_down_strategy)
+        self._root.bind("b", self.bottom_up_strategy)
+        self._root.bind("c", self.bottom_up_leftcorner_strategy)
+        self._root.bind("<space>", self._stop_animation)
+
+        self._root.bind("<Control-g>", self.edit_grammar)
+        self._root.bind("<Control-t>", self.edit_sentence)
+
+        # Animation speed control
+        self._root.bind("-", lambda e, a=self._animate: a.set(1))
+        self._root.bind("=", lambda e, a=self._animate: a.set(2))
+        self._root.bind("+", lambda e, a=self._animate:
a.set(3)) + + # Step control + self._root.bind("s", lambda e, s=self._step: s.set(not s.get())) + + def _init_menubar(self): + menubar = Menu(self._root) + + filemenu = Menu(menubar, tearoff=0) + filemenu.add_command( + label="Save Chart", + underline=0, + command=self.save_chart, + accelerator="Ctrl-s", + ) + filemenu.add_command( + label="Load Chart", + underline=0, + command=self.load_chart, + accelerator="Ctrl-o", + ) + filemenu.add_command( + label="Reset Chart", underline=0, command=self.reset, accelerator="Ctrl-r" + ) + filemenu.add_separator() + filemenu.add_command(label="Save Grammar", command=self.save_grammar) + filemenu.add_command(label="Load Grammar", command=self.load_grammar) + filemenu.add_separator() + filemenu.add_command( + label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x" + ) + menubar.add_cascade(label="File", underline=0, menu=filemenu) + + editmenu = Menu(menubar, tearoff=0) + editmenu.add_command( + label="Edit Grammar", + underline=5, + command=self.edit_grammar, + accelerator="Ctrl-g", + ) + editmenu.add_command( + label="Edit Text", + underline=5, + command=self.edit_sentence, + accelerator="Ctrl-t", + ) + menubar.add_cascade(label="Edit", underline=0, menu=editmenu) + + viewmenu = Menu(menubar, tearoff=0) + viewmenu.add_command( + label="Chart Matrix", underline=6, command=self.view_matrix + ) + viewmenu.add_command(label="Results", underline=0, command=self.view_results) + menubar.add_cascade(label="View", underline=0, menu=viewmenu) + + rulemenu = Menu(menubar, tearoff=0) + rulemenu.add_command( + label="Top Down Strategy", + underline=0, + command=self.top_down_strategy, + accelerator="t", + ) + rulemenu.add_command( + label="Bottom Up Strategy", + underline=0, + command=self.bottom_up_strategy, + accelerator="b", + ) + rulemenu.add_command( + label="Bottom Up Left-Corner Strategy", + underline=0, + command=self.bottom_up_leftcorner_strategy, + accelerator="c", + ) + rulemenu.add_separator() + rulemenu.add_command(label="Bottom Up Rule", command=self.bottom_up) + rulemenu.add_command( + label="Bottom Up Left-Corner Rule", command=self.bottom_up_leftcorner + ) + rulemenu.add_command(label="Top Down Init Rule", command=self.top_down_init) + rulemenu.add_command( + label="Top Down Predict Rule", command=self.top_down_predict + ) + rulemenu.add_command(label="Fundamental Rule", command=self.fundamental) + menubar.add_cascade(label="Apply", underline=0, menu=rulemenu) + + animatemenu = Menu(menubar, tearoff=0) + animatemenu.add_checkbutton( + label="Step", underline=0, variable=self._step, accelerator="s" + ) + animatemenu.add_separator() + animatemenu.add_radiobutton( + label="No Animation", underline=0, variable=self._animate, value=0 + ) + animatemenu.add_radiobutton( + label="Slow Animation", + underline=0, + variable=self._animate, + value=1, + accelerator="-", + ) + animatemenu.add_radiobutton( + label="Normal Animation", + underline=0, + variable=self._animate, + value=2, + accelerator="=", + ) + animatemenu.add_radiobutton( + label="Fast Animation", + underline=0, + variable=self._animate, + value=3, + accelerator="+", + ) + menubar.add_cascade(label="Animate", underline=1, menu=animatemenu) + + zoommenu = Menu(menubar, tearoff=0) + zoommenu.add_radiobutton( + label="Tiny", + variable=self._size, + underline=0, + value=10, + command=self.resize, + ) + zoommenu.add_radiobutton( + label="Small", + variable=self._size, + underline=0, + value=12, + command=self.resize, + ) + zoommenu.add_radiobutton( + label="Medium", + 
variable=self._size, + underline=0, + value=14, + command=self.resize, + ) + zoommenu.add_radiobutton( + label="Large", + variable=self._size, + underline=0, + value=18, + command=self.resize, + ) + zoommenu.add_radiobutton( + label="Huge", + variable=self._size, + underline=0, + value=24, + command=self.resize, + ) + menubar.add_cascade(label="Zoom", underline=0, menu=zoommenu) + + helpmenu = Menu(menubar, tearoff=0) + helpmenu.add_command(label="About", underline=0, command=self.about) + helpmenu.add_command( + label="Instructions", underline=0, command=self.help, accelerator="F1" + ) + menubar.add_cascade(label="Help", underline=0, menu=helpmenu) + + self._root.config(menu=menubar) + + # //////////////////////////////////////////////////////////// + # Selection Handling + # //////////////////////////////////////////////////////////// + + def _click_cv_edge(self, edge): + if edge != self._selection: + # Clicking on a new edge selects it. + self._select_edge(edge) + else: + # Repeated clicks on one edge cycle its trees. + self._cv.cycle_tree() + # [XX] this can get confused if animation is running + # faster than the callbacks... + + def _select_matrix_edge(self, edge): + self._select_edge(edge) + self._cv.view_edge(edge) + + def _select_edge(self, edge): + self._selection = edge + # Update the chart view. + self._cv.markonly_edge(edge, "#f00") + self._cv.draw_tree(edge) + # Update the matrix view. + if self._matrix: + self._matrix.markonly_edge(edge) + if self._matrix: + self._matrix.view_edge(edge) + + def _deselect_edge(self): + self._selection = None + # Update the chart view. + self._cv.unmark_edge() + self._cv.erase_tree() + # Update the matrix view + if self._matrix: + self._matrix.unmark_edge() + + def _show_new_edge(self, edge): + self._display_rule(self._cp.current_chartrule()) + # Update the chart view. + self._cv.update() + self._cv.draw_tree(edge) + self._cv.markonly_edge(edge, "#0df") + self._cv.view_edge(edge) + # Update the matrix view. + if self._matrix: + self._matrix.update() + if self._matrix: + self._matrix.markonly_edge(edge) + if self._matrix: + self._matrix.view_edge(edge) + # Update the results view. + if self._results: + self._results.update(edge) + + # //////////////////////////////////////////////////////////// + # Help/usage + # //////////////////////////////////////////////////////////// + + def help(self, *e): + self._animating = 0 + # The default font's not very legible; try using 'fixed' instead. 
+        try:
+            ShowText(
+                self._root,
+                "Help: Chart Parser Application",
+                (__doc__ or "").strip(),
+                width=75,
+                font="fixed",
+            )
+        except:
+            ShowText(
+                self._root,
+                "Help: Chart Parser Application",
+                (__doc__ or "").strip(),
+                width=75,
+            )
+
+    def about(self, *e):
+        ABOUT = "NLTK Chart Parser Application\n" + "Written by Edward Loper"
+        showinfo("About: Chart Parser Application", ABOUT)
+
+    # ////////////////////////////////////////////////////////////
+    # File Menu
+    # ////////////////////////////////////////////////////////////
+
+    CHART_FILE_TYPES = [("Pickle file", ".pickle"), ("All files", "*")]
+    GRAMMAR_FILE_TYPES = [
+        ("Plaintext grammar file", ".cfg"),
+        ("Pickle file", ".pickle"),
+        ("All files", "*"),
+    ]
+
+    def load_chart(self, *args):
+        "Load a chart from a pickle file"
+        filename = askopenfilename(
+            filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle"
+        )
+        if not filename:
+            return
+        try:
+            with open(filename, "rb") as infile:
+                chart = pickle.load(infile)
+            self._chart = chart
+            self._cv.update(chart)
+            if self._matrix:
+                self._matrix.set_chart(chart)
+            if self._matrix:
+                self._matrix.deselect_cell()
+            if self._results:
+                self._results.set_chart(chart)
+            self._cp.set_chart(chart)
+        except Exception:
+            # Report the failure instead of crashing the GUI.
+            showerror("Error Loading Chart", "Unable to open file: %r" % filename)
+
+    def save_chart(self, *args):
+        "Save a chart to a pickle file"
+        filename = asksaveasfilename(
+            filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle"
+        )
+        if not filename:
+            return
+        try:
+            with open(filename, "wb") as outfile:
+                pickle.dump(self._chart, outfile)
+        except Exception:
+            showerror("Error Saving Chart", "Unable to open file: %r" % filename)
+
+    def load_grammar(self, *args):
+        "Load a grammar from a pickle or plaintext CFG file"
+        filename = askopenfilename(
+            filetypes=self.GRAMMAR_FILE_TYPES, defaultextension=".cfg"
+        )
+        if not filename:
+            return
+        try:
+            if filename.endswith(".pickle"):
+                with open(filename, "rb") as infile:
+                    grammar = pickle.load(infile)
+            else:
+                with open(filename) as infile:
+                    grammar = CFG.fromstring(infile.read())
+            self.set_grammar(grammar)
+        except Exception:
+            showerror("Error Loading Grammar", "Unable to open file: %r" % filename)
+
+    def save_grammar(self, *args):
+        filename = asksaveasfilename(
+            filetypes=self.GRAMMAR_FILE_TYPES, defaultextension=".cfg"
+        )
+        if not filename:
+            return
+        try:
+            if filename.endswith(".pickle"):
+                with open(filename, "wb") as outfile:
+                    pickle.dump((self._chart, self._tokens), outfile)
+            else:
+                with open(filename, "w") as outfile:
+                    prods = self._grammar.productions()
+                    start = [p for p in prods if p.lhs() == self._grammar.start()]
+                    rest = [p for p in prods if p.lhs() != self._grammar.start()]
+                    for prod in start:
+                        outfile.write("%s\n" % prod)
+                    for prod in rest:
+                        outfile.write("%s\n" % prod)
+        except Exception:
+            showerror("Error Saving Grammar", "Unable to open file: %r" % filename)
+
+    def reset(self, *args):
+        self._animating = 0
+        self._reset_parser()
+        self._cv.update(self._chart)
+        if self._matrix:
+            self._matrix.set_chart(self._chart)
+        if self._matrix:
+            self._matrix.deselect_cell()
+        if self._results:
+            self._results.set_chart(self._chart)
+
+    # ////////////////////////////////////////////////////////////
+    # Edit
+    # ////////////////////////////////////////////////////////////
+
+    def edit_grammar(self, *e):
+        CFGEditor(self._root, self._grammar, self.set_grammar)
+
+    def set_grammar(self, grammar):
+        self._grammar = grammar
+        self._cp.set_grammar(grammar)
+        if
self._results: + self._results.set_grammar(grammar) + + def edit_sentence(self, *e): + sentence = " ".join(self._tokens) + title = "Edit Text" + instr = "Enter a new sentence to parse." + EntryDialog(self._root, sentence, instr, self.set_sentence, title) + + def set_sentence(self, sentence): + self._tokens = list(sentence.split()) + self.reset() + + # //////////////////////////////////////////////////////////// + # View Menu + # //////////////////////////////////////////////////////////// + + def view_matrix(self, *e): + if self._matrix is not None: + self._matrix.destroy() + self._matrix = ChartMatrixView(self._root, self._chart) + self._matrix.add_callback("select", self._select_matrix_edge) + + def view_results(self, *e): + if self._results is not None: + self._results.destroy() + self._results = ChartResultsView(self._root, self._chart, self._grammar) + + # //////////////////////////////////////////////////////////// + # Zoom Menu + # //////////////////////////////////////////////////////////// + + def resize(self): + self._animating = 0 + self.set_font_size(self._size.get()) + + def set_font_size(self, size): + self._cv.set_font_size(size) + self._font.configure(size=-abs(size)) + self._boldfont.configure(size=-abs(size)) + self._sysfont.configure(size=-abs(size)) + + def get_font_size(self): + return abs(self._size.get()) + + # //////////////////////////////////////////////////////////// + # Parsing + # //////////////////////////////////////////////////////////// + + def apply_strategy(self, strategy, edge_strategy=None): + # If we're animating, then stop. + if self._animating: + self._animating = 0 + return + + # Clear the rule display & mark. + self._display_rule(None) + # self._cv.unmark_edge() + + if self._step.get(): + selection = self._selection + if (selection is not None) and (edge_strategy is not None): + # Apply the given strategy to the selected edge. + self._cp.set_strategy([edge_strategy(selection)]) + newedge = self._apply_strategy() + + # If it failed, then clear the selection. 
+ if newedge is None: + self._cv.unmark_edge() + self._selection = None + else: + self._cp.set_strategy(strategy) + self._apply_strategy() + + else: + self._cp.set_strategy(strategy) + if self._animate.get(): + self._animating = 1 + self._animate_strategy() + else: + for edge in self._cpstep: + if edge is None: + break + self._cv.update() + if self._matrix: + self._matrix.update() + if self._results: + self._results.update() + + def _stop_animation(self, *e): + self._animating = 0 + + def _animate_strategy(self, speed=1): + if self._animating == 0: + return + if self._apply_strategy() is not None: + if self._animate.get() == 0 or self._step.get() == 1: + return + if self._animate.get() == 1: + self._root.after(3000, self._animate_strategy) + elif self._animate.get() == 2: + self._root.after(1000, self._animate_strategy) + else: + self._root.after(20, self._animate_strategy) + + def _apply_strategy(self): + new_edge = next(self._cpstep) + + if new_edge is not None: + self._show_new_edge(new_edge) + return new_edge + + def _display_rule(self, rule): + if rule is None: + self._rulelabel2["text"] = "" + else: + name = str(rule) + self._rulelabel2["text"] = name + size = self._cv.get_font_size() + + # //////////////////////////////////////////////////////////// + # Parsing Strategies + # //////////////////////////////////////////////////////////// + + # Basic rules: + _TD_INIT = [TopDownInitRule()] + _TD_PREDICT = [TopDownPredictRule()] + _BU_RULE = [BottomUpPredictRule()] + _BU_LC_RULE = [BottomUpPredictCombineRule()] + _FUNDAMENTAL = [SingleEdgeFundamentalRule()] + + # Complete strategies: + _TD_STRATEGY = _TD_INIT + _TD_PREDICT + _FUNDAMENTAL + _BU_STRATEGY = _BU_RULE + _FUNDAMENTAL + _BU_LC_STRATEGY = _BU_LC_RULE + _FUNDAMENTAL + + # Button callback functions: + def top_down_init(self, *e): + self.apply_strategy(self._TD_INIT, None) + + def top_down_predict(self, *e): + self.apply_strategy(self._TD_PREDICT, TopDownPredictEdgeRule) + + def bottom_up(self, *e): + self.apply_strategy(self._BU_RULE, BottomUpEdgeRule) + + def bottom_up_leftcorner(self, *e): + self.apply_strategy(self._BU_LC_RULE, BottomUpLeftCornerEdgeRule) + + def fundamental(self, *e): + self.apply_strategy(self._FUNDAMENTAL, FundamentalEdgeRule) + + def bottom_up_strategy(self, *e): + self.apply_strategy(self._BU_STRATEGY, BottomUpEdgeRule) + + def bottom_up_leftcorner_strategy(self, *e): + self.apply_strategy(self._BU_LC_STRATEGY, BottomUpLeftCornerEdgeRule) + + def top_down_strategy(self, *e): + self.apply_strategy(self._TD_STRATEGY, TopDownPredictEdgeRule) + + +def app(): + grammar = CFG.fromstring( + """ + # Grammatical productions. + S -> NP VP + VP -> VP PP | V NP | V + NP -> Det N | NP PP + PP -> P NP + # Lexical productions. 
+ NP -> 'John' | 'I' + Det -> 'the' | 'my' | 'a' + N -> 'dog' | 'cookie' | 'table' | 'cake' | 'fork' + V -> 'ate' | 'saw' + P -> 'on' | 'under' | 'with' + """ + ) + + sent = "John ate the cake on the table with a fork" + sent = "John ate the cake on the table" + tokens = list(sent.split()) + + print("grammar= (") + for rule in grammar.productions(): + print((" ", repr(rule) + ",")) + print(")") + print("tokens = %r" % tokens) + print('Calling "ChartParserApp(grammar, tokens)"...') + ChartParserApp(grammar, tokens).mainloop() + + +if __name__ == "__main__": + app() + + # Chart comparer: + # charts = ['/tmp/earley.pickle', + # '/tmp/topdown.pickle', + # '/tmp/bottomup.pickle'] + # ChartComparer(*charts).mainloop() + + # import profile + # profile.run('demo2()', '/tmp/profile.out') + # import pstats + # p = pstats.Stats('/tmp/profile.out') + # p.strip_dirs().sort_stats('time', 'cum').print_stats(60) + # p.strip_dirs().sort_stats('cum', 'time').print_stats(60) + +__all__ = ["app"] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/book.py b/.eggs/nltk-3.8-py3.10.egg/nltk/book.py new file mode 100644 index 0000000000000000000000000000000000000000..7f79e9b60a61a9c7a297992522de0c18d950f8e6 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/book.py @@ -0,0 +1,213 @@ +# Natural Language Toolkit: Some texts for exploration in chapter 1 of the book +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# +# URL: +# For license information, see LICENSE.TXT + +from nltk.corpus import ( + genesis, + gutenberg, + inaugural, + nps_chat, + treebank, + webtext, + wordnet, +) +from nltk.probability import FreqDist +from nltk.text import Text +from nltk.util import bigrams + +print("*** Introductory Examples for the NLTK Book ***") +print("Loading text1, ..., text9 and sent1, ..., sent9") +print("Type the name of the text or sentence to view it.") +print("Type: 'texts()' or 'sents()' to list the materials.") + +text1 = Text(gutenberg.words("melville-moby_dick.txt")) +print("text1:", text1.name) + +text2 = Text(gutenberg.words("austen-sense.txt")) +print("text2:", text2.name) + +text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis") +print("text3:", text3.name) + +text4 = Text(inaugural.words(), name="Inaugural Address Corpus") +print("text4:", text4.name) + +text5 = Text(nps_chat.words(), name="Chat Corpus") +print("text5:", text5.name) + +text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail") +print("text6:", text6.name) + +text7 = Text(treebank.words(), name="Wall Street Journal") +print("text7:", text7.name) + +text8 = Text(webtext.words("singles.txt"), name="Personals Corpus") +print("text8:", text8.name) + +text9 = Text(gutenberg.words("chesterton-thursday.txt")) +print("text9:", text9.name) + + +def texts(): + print("text1:", text1.name) + print("text2:", text2.name) + print("text3:", text3.name) + print("text4:", text4.name) + print("text5:", text5.name) + print("text6:", text6.name) + print("text7:", text7.name) + print("text8:", text8.name) + print("text9:", text9.name) + + +sent1 = ["Call", "me", "Ishmael", "."] +sent2 = [ + "The", + "family", + "of", + "Dashwood", + "had", + "long", + "been", + "settled", + "in", + "Sussex", + ".", +] +sent3 = [ + "In", + "the", + "beginning", + "God", + "created", + "the", + "heaven", + "and", + "the", + "earth", + ".", +] +sent4 = [ + "Fellow", + "-", + "Citizens", + "of", + "the", + "Senate", + "and", + "of", + "the", + "House", + "of", + "Representatives", + ":", +] +sent5 = [ + "I", + "have", + "a", + 
"problem", + "with", + "people", + "PMing", + "me", + "to", + "lol", + "JOIN", +] +sent6 = [ + "SCENE", + "1", + ":", + "[", + "wind", + "]", + "[", + "clop", + "clop", + "clop", + "]", + "KING", + "ARTHUR", + ":", + "Whoa", + "there", + "!", +] +sent7 = [ + "Pierre", + "Vinken", + ",", + "61", + "years", + "old", + ",", + "will", + "join", + "the", + "board", + "as", + "a", + "nonexecutive", + "director", + "Nov.", + "29", + ".", +] +sent8 = [ + "25", + "SEXY", + "MALE", + ",", + "seeks", + "attrac", + "older", + "single", + "lady", + ",", + "for", + "discreet", + "encounters", + ".", +] +sent9 = [ + "THE", + "suburb", + "of", + "Saffron", + "Park", + "lay", + "on", + "the", + "sunset", + "side", + "of", + "London", + ",", + "as", + "red", + "and", + "ragged", + "as", + "a", + "cloud", + "of", + "sunset", + ".", +] + + +def sents(): + print("sent1:", " ".join(sent1)) + print("sent2:", " ".join(sent2)) + print("sent3:", " ".join(sent3)) + print("sent4:", " ".join(sent4)) + print("sent5:", " ".join(sent5)) + print("sent6:", " ".join(sent6)) + print("sent7:", " ".join(sent7)) + print("sent8:", " ".join(sent8)) + print("sent9:", " ".join(sent9)) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/classify/rte_classify.py b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/rte_classify.py new file mode 100644 index 0000000000000000000000000000000000000000..9642765625e2431fa2f9805c5b1463ada74ab4a1 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/rte_classify.py @@ -0,0 +1,183 @@ +# Natural Language Toolkit: RTE Classifier +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Ewan Klein +# URL: +# For license information, see LICENSE.TXT + +""" +Simple classifier for RTE corpus. + +It calculates the overlap in words and named entities between text and +hypothesis, and also whether there are words / named entities in the +hypothesis which fail to occur in the text, since this is an indicator that +the hypothesis is more informative than (i.e not entailed by) the text. + +TO DO: better Named Entity classification +TO DO: add lemmatization +""" + +from nltk.classify.maxent import MaxentClassifier +from nltk.classify.util import accuracy +from nltk.tokenize import RegexpTokenizer + + +class RTEFeatureExtractor: + """ + This builds a bag of words for both the text and the hypothesis after + throwing away some stopwords, then calculates overlap and difference. + """ + + def __init__(self, rtepair, stop=True, use_lemmatize=False): + """ + :param rtepair: a ``RTEPair`` from which features should be extracted + :param stop: if ``True``, stopwords are thrown away. + :type stop: bool + """ + self.stop = stop + self.stopwords = { + "a", + "the", + "it", + "they", + "of", + "in", + "to", + "is", + "have", + "are", + "were", + "and", + "very", + ".", + ",", + } + + self.negwords = {"no", "not", "never", "failed", "rejected", "denied"} + # Try to tokenize so that abbreviations, monetary amounts, email + # addresses, URLs are single tokens. 
+ tokenizer = RegexpTokenizer(r"[\w.@:/]+|\w+|\$[\d.]+") + + # Get the set of word types for text and hypothesis + self.text_tokens = tokenizer.tokenize(rtepair.text) + self.hyp_tokens = tokenizer.tokenize(rtepair.hyp) + self.text_words = set(self.text_tokens) + self.hyp_words = set(self.hyp_tokens) + + if use_lemmatize: + self.text_words = {self._lemmatize(token) for token in self.text_tokens} + self.hyp_words = {self._lemmatize(token) for token in self.hyp_tokens} + + if self.stop: + self.text_words = self.text_words - self.stopwords + self.hyp_words = self.hyp_words - self.stopwords + + self._overlap = self.hyp_words & self.text_words + self._hyp_extra = self.hyp_words - self.text_words + self._txt_extra = self.text_words - self.hyp_words + + def overlap(self, toktype, debug=False): + """ + Compute the overlap between text and hypothesis. + + :param toktype: distinguish Named Entities from ordinary words + :type toktype: 'ne' or 'word' + """ + ne_overlap = {token for token in self._overlap if self._ne(token)} + if toktype == "ne": + if debug: + print("ne overlap", ne_overlap) + return ne_overlap + elif toktype == "word": + if debug: + print("word overlap", self._overlap - ne_overlap) + return self._overlap - ne_overlap + else: + raise ValueError("Type not recognized:'%s'" % toktype) + + def hyp_extra(self, toktype, debug=True): + """ + Compute the extraneous material in the hypothesis. + + :param toktype: distinguish Named Entities from ordinary words + :type toktype: 'ne' or 'word' + """ + ne_extra = {token for token in self._hyp_extra if self._ne(token)} + if toktype == "ne": + return ne_extra + elif toktype == "word": + return self._hyp_extra - ne_extra + else: + raise ValueError("Type not recognized: '%s'" % toktype) + + @staticmethod + def _ne(token): + """ + This just assumes that words in all caps or titles are + named entities. + + :type token: str + """ + if token.istitle() or token.isupper(): + return True + return False + + @staticmethod + def _lemmatize(word): + """ + Use morphy from WordNet to find the base form of verbs. + """ + from nltk.corpus import wordnet as wn + + lemma = wn.morphy(word, pos=wn.VERB) + if lemma is not None: + return lemma + return word + + +def rte_features(rtepair): + extractor = RTEFeatureExtractor(rtepair) + features = {} + features["alwayson"] = True + features["word_overlap"] = len(extractor.overlap("word")) + features["word_hyp_extra"] = len(extractor.hyp_extra("word")) + features["ne_overlap"] = len(extractor.overlap("ne")) + features["ne_hyp_extra"] = len(extractor.hyp_extra("ne")) + features["neg_txt"] = len(extractor.negwords & extractor.text_words) + features["neg_hyp"] = len(extractor.negwords & extractor.hyp_words) + return features + + +def rte_featurize(rte_pairs): + return [(rte_features(pair), pair.value) for pair in rte_pairs] + + +def rte_classifier(algorithm, sample_N=None): + from nltk.corpus import rte as rte_corpus + + train_set = rte_corpus.pairs(["rte1_dev.xml", "rte2_dev.xml", "rte3_dev.xml"]) + test_set = rte_corpus.pairs(["rte1_test.xml", "rte2_test.xml", "rte3_test.xml"]) + + if sample_N is not None: + train_set = train_set[:sample_N] + test_set = test_set[:sample_N] + + featurized_train_set = rte_featurize(train_set) + featurized_test_set = rte_featurize(test_set) + + # Train the classifier + print("Training classifier...") + if algorithm in ["megam"]: # MEGAM based algorithms. 
+        clf = MaxentClassifier.train(featurized_train_set, algorithm)
+    elif algorithm in ["GIS", "IIS"]:  # Use default GIS/IIS MaxEnt algorithm
+        clf = MaxentClassifier.train(featurized_train_set, algorithm)
+    else:
+        err_msg = str(
+            "RTEClassifier only supports these algorithms:\n "
+            "'megam', 'GIS', 'IIS'.\n"
+        )
+        raise Exception(err_msg)
+    print("Testing classifier...")
+    acc = accuracy(clf, featurized_test_set)
+    print("Accuracy: %6.4f" % acc)
+    return clf
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/classify/scikitlearn.py b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/scikitlearn.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1a35a416e2aebc873dad0559b75f85be3ad8200
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/scikitlearn.py
@@ -0,0 +1,143 @@
+# Natural Language Toolkit: Interface to scikit-learn classifiers
+#
+# Author: Lars Buitinck
+# URL:
+# For license information, see LICENSE.TXT
+"""
+scikit-learn (https://scikit-learn.org) is a machine learning library for
+Python. It supports many classification algorithms, including SVMs,
+Naive Bayes, logistic regression (MaxEnt) and decision trees.
+
+This package implements a wrapper around scikit-learn classifiers. To use this
+wrapper, construct a scikit-learn estimator object, then use that to construct
+a SklearnClassifier. E.g., to wrap a linear SVM with default settings:
+
+>>> from sklearn.svm import LinearSVC
+>>> from nltk.classify.scikitlearn import SklearnClassifier
+>>> classif = SklearnClassifier(LinearSVC())
+
+A scikit-learn classifier may include preprocessing steps when it's wrapped
+in a Pipeline object. The following constructs and wraps a Naive Bayes text
+classifier with tf-idf weighting and chi-square feature selection to get the
+best 1000 features:
+
+>>> from sklearn.feature_extraction.text import TfidfTransformer
+>>> from sklearn.feature_selection import SelectKBest, chi2
+>>> from sklearn.naive_bayes import MultinomialNB
+>>> from sklearn.pipeline import Pipeline
+>>> pipeline = Pipeline([('tfidf', TfidfTransformer()),
+...                      ('chi2', SelectKBest(chi2, k=1000)),
+...                      ('nb', MultinomialNB())])
+>>> classif = SklearnClassifier(pipeline)
+"""
+
+from nltk.classify.api import ClassifierI
+from nltk.probability import DictionaryProbDist
+
+try:
+    from sklearn.feature_extraction import DictVectorizer
+    from sklearn.preprocessing import LabelEncoder
+except ImportError:
+    pass
+
+__all__ = ["SklearnClassifier"]
+
+
+class SklearnClassifier(ClassifierI):
+    """Wrapper for scikit-learn classifiers."""
+
+    def __init__(self, estimator, dtype=float, sparse=True):
+        """
+        :param estimator: scikit-learn classifier object.
+
+        :param dtype: data type used when building feature array.
+            scikit-learn estimators work exclusively on numeric data. The
+            default value should be fine for almost all situations.
+
+        :param sparse: Whether to use sparse matrices internally.
+            The estimator must support these; not all scikit-learn classifiers
+            do (see their respective documentation and look for "sparse
+            matrix"). The default value is True, since most NLP problems
+            involve sparse feature sets. Setting this to False may take a
+            great amount of memory.
+        :type sparse: boolean.
+        """
+        self._clf = estimator
+        self._encoder = LabelEncoder()
+        self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
+
+    def __repr__(self):
+        return "<SklearnClassifier(%r)>" % self._clf
+
+    def classify_many(self, featuresets):
+        """Classify a batch of samples.
+ + :param featuresets: An iterable over featuresets, each a dict mapping + strings to either numbers, booleans or strings. + :return: The predicted class label for each input sample. + :rtype: list + """ + X = self._vectorizer.transform(featuresets) + classes = self._encoder.classes_ + return [classes[i] for i in self._clf.predict(X)] + + def prob_classify_many(self, featuresets): + """Compute per-class probabilities for a batch of samples. + + :param featuresets: An iterable over featuresets, each a dict mapping + strings to either numbers, booleans or strings. + :rtype: list of ``ProbDistI`` + """ + X = self._vectorizer.transform(featuresets) + y_proba_list = self._clf.predict_proba(X) + return [self._make_probdist(y_proba) for y_proba in y_proba_list] + + def labels(self): + """The class labels used by this classifier. + + :rtype: list + """ + return list(self._encoder.classes_) + + def train(self, labeled_featuresets): + """ + Train (fit) the scikit-learn estimator. + + :param labeled_featuresets: A list of ``(featureset, label)`` + where each ``featureset`` is a dict mapping strings to either + numbers, booleans or strings. + """ + + X, y = list(zip(*labeled_featuresets)) + X = self._vectorizer.fit_transform(X) + y = self._encoder.fit_transform(y) + self._clf.fit(X, y) + + return self + + def _make_probdist(self, y_proba): + classes = self._encoder.classes_ + return DictionaryProbDist({classes[i]: p for i, p in enumerate(y_proba)}) + + +if __name__ == "__main__": + from sklearn.linear_model import LogisticRegression + from sklearn.naive_bayes import BernoulliNB + + from nltk.classify.util import names_demo, names_demo_features + + # Bernoulli Naive Bayes is designed for binary classification. We set the + # binarize option to False since we know we're passing boolean features. + print("scikit-learn Naive Bayes:") + names_demo( + SklearnClassifier(BernoulliNB(binarize=False)).train, + features=names_demo_features, + ) + + # The C parameter on logistic regression (MaxEnt) controls regularization. + # The higher it's set, the less regularized the classifier is. + print("\n\nscikit-learn logistic regression:") + names_demo( + SklearnClassifier(LogisticRegression(C=1000)).train, + features=names_demo_features, + ) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/classify/senna.py b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/senna.py new file mode 100644 index 0000000000000000000000000000000000000000..9ff140542d39ef7ed9c300b9c36834dcfffc515d --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/senna.py @@ -0,0 +1,176 @@ +# Natural Language Toolkit: Senna Interface +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Rami Al-Rfou' +# URL: +# For license information, see LICENSE.TXT + +""" +A general interface to the SENNA pipeline that supports any of the +operations specified in SUPPORTED_OPERATIONS. + +Applying multiple operations at once has the speed advantage. For example, +Senna will automatically determine POS tags if you are extracting named +entities. Applying both of the operations will cost only the time of +extracting the named entities. + +The SENNA pipeline has a fixed maximum size of the sentences that it can read. +By default it is 1024 token/sentence. If you have larger sentences, changing +the MAX_SENTENCE_SIZE value in SENNA_main.c should be considered and your +system specific binary should be rebuilt. Otherwise this could introduce +misalignment errors. + +The input is: + +- path to the directory that contains SENNA executables. 
If the path is incorrect, + Senna will automatically search for executable file specified in SENNA environment variable +- List of the operations needed to be performed. +- (optionally) the encoding of the input data (default:utf-8) + +Note: Unit tests for this module can be found in test/unit/test_senna.py + +>>> from nltk.classify import Senna +>>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner']) # doctest: +SKIP +>>> sent = 'Dusseldorf is an international business center'.split() +>>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)] # doctest: +SKIP +[('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'), +('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')] +""" + +from os import environ, path, sep +from platform import architecture, system +from subprocess import PIPE, Popen + +from nltk.tag.api import TaggerI + + +class Senna(TaggerI): + + SUPPORTED_OPERATIONS = ["pos", "chk", "ner"] + + def __init__(self, senna_path, operations, encoding="utf-8"): + self._encoding = encoding + self._path = path.normpath(senna_path) + sep + + # Verifies the existence of the executable on the self._path first + # senna_binary_file_1 = self.executable(self._path) + exe_file_1 = self.executable(self._path) + if not path.isfile(exe_file_1): + # Check for the system environment + if "SENNA" in environ: + # self._path = path.join(environ['SENNA'],'') + self._path = path.normpath(environ["SENNA"]) + sep + exe_file_2 = self.executable(self._path) + if not path.isfile(exe_file_2): + raise LookupError( + "Senna executable expected at %s or %s but not found" + % (exe_file_1, exe_file_2) + ) + + self.operations = operations + + def executable(self, base_path): + """ + The function that determines the system specific binary that should be + used in the pipeline. In case, the system is not known the default senna binary will + be used. + """ + os_name = system() + if os_name == "Linux": + bits = architecture()[0] + if bits == "64bit": + return path.join(base_path, "senna-linux64") + return path.join(base_path, "senna-linux32") + if os_name == "Windows": + return path.join(base_path, "senna-win32.exe") + if os_name == "Darwin": + return path.join(base_path, "senna-osx") + return path.join(base_path, "senna") + + def _map(self): + """ + A method that calculates the order of the columns that SENNA pipeline + will output the tags into. This depends on the operations being ordered. + """ + _map = {} + i = 1 + for operation in Senna.SUPPORTED_OPERATIONS: + if operation in self.operations: + _map[operation] = i + i += 1 + return _map + + def tag(self, tokens): + """ + Applies the specified operation(s) on a list of tokens. + """ + return self.tag_sents([tokens])[0] + + def tag_sents(self, sentences): + """ + Applies the tag method over a list of sentences. This method will return a + list of dictionaries. Every dictionary will contain a word with its + calculated annotations/tags. 
+ """ + encoding = self._encoding + + if not path.isfile(self.executable(self._path)): + raise LookupError( + "Senna executable expected at %s but not found" + % self.executable(self._path) + ) + + # Build the senna command to run the tagger + _senna_cmd = [ + self.executable(self._path), + "-path", + self._path, + "-usrtokens", + "-iobtags", + ] + _senna_cmd.extend(["-" + op for op in self.operations]) + + # Serialize the actual sentences to a temporary string + _input = "\n".join(" ".join(x) for x in sentences) + "\n" + if isinstance(_input, str) and encoding: + _input = _input.encode(encoding) + + # Run the tagger and get the output + p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE) + (stdout, stderr) = p.communicate(input=_input) + senna_output = stdout + + # Check the return code. + if p.returncode != 0: + raise RuntimeError("Senna command failed! Details: %s" % stderr) + + if encoding: + senna_output = stdout.decode(encoding) + + # Output the tagged sentences + map_ = self._map() + tagged_sentences = [[]] + sentence_index = 0 + token_index = 0 + for tagged_word in senna_output.strip().split("\n"): + if not tagged_word: + tagged_sentences.append([]) + sentence_index += 1 + token_index = 0 + continue + tags = tagged_word.split("\t") + result = {} + for tag in map_: + result[tag] = tags[map_[tag]].strip() + try: + result["word"] = sentences[sentence_index][token_index] + except IndexError as e: + raise IndexError( + "Misalignment error occurred at sentence number %d. Possible reason" + " is that the sentence size exceeded the maximum size. Check the " + "documentation of Senna class for more information." + % sentence_index + ) from e + tagged_sentences[-1].append(result) + token_index += 1 + return tagged_sentences diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/classify/svm.py b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/svm.py new file mode 100644 index 0000000000000000000000000000000000000000..2d0046af4f700fa35b385645c3228d71f16bb9af --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/svm.py @@ -0,0 +1,17 @@ +# Natural Language Toolkit: SVM-based classifier +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Leon Derczynski +# +# URL: +# For license information, see LICENSE.TXT +""" +nltk.classify.svm was deprecated. For classification based +on support vector machines SVMs use nltk.classify.scikitlearn +(or `scikit-learn `_ directly). +""" + + +class SvmClassifier: + def __init__(self, *args, **kwargs): + raise NotImplementedError(__doc__) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/classify/tadm.py b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/tadm.py new file mode 100644 index 0000000000000000000000000000000000000000..61687ed4e3a81a80c7c90b86f842c4142723842e --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/tadm.py @@ -0,0 +1,122 @@ +# Natural Language Toolkit: Interface to TADM Classifier +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Joseph Frazee +# URL: +# For license information, see LICENSE.TXT + +import subprocess +import sys + +from nltk.internals import find_binary + +try: + import numpy +except ImportError: + pass + +_tadm_bin = None + + +def config_tadm(bin=None): + global _tadm_bin + _tadm_bin = find_binary( + "tadm", bin, env_vars=["TADM"], binary_names=["tadm"], url="http://tadm.sf.net" + ) + + +def write_tadm_file(train_toks, encoding, stream): + """ + Generate an input file for ``tadm`` based on the given corpus of + classified tokens. 
+
+    :type train_toks: list(tuple(dict, str))
+    :param train_toks: Training data, represented as a list of
+        pairs, the first member of which is a feature dictionary,
+        and the second of which is a classification label.
+    :type encoding: TadmEventMaxentFeatureEncoding
+    :param encoding: A feature encoding, used to convert featuresets
+        into feature vectors.
+    :type stream: stream
+    :param stream: The stream to which the ``tadm`` input file should be
+        written.
+    """
+    # See the following for a file format description:
+    #
+    # https://sf.net/forum/forum.php?thread_id=1391502&forum_id=473054
+    # https://sf.net/forum/forum.php?thread_id=1675097&forum_id=473054
+    labels = encoding.labels()
+    for featureset, label in train_toks:
+        length_line = "%d\n" % len(labels)
+        stream.write(length_line)
+        for known_label in labels:
+            v = encoding.encode(featureset, known_label)
+            line = "%d %d %s\n" % (
+                int(label == known_label),
+                len(v),
+                " ".join("%d %d" % u for u in v),
+            )
+            stream.write(line)
+
+
+def parse_tadm_weights(paramfile):
+    """
+    Given the stdout output generated by ``tadm`` when training a
+    model, return a ``numpy`` array containing the corresponding weight
+    vector.
+    """
+    weights = []
+    for line in paramfile:
+        weights.append(float(line.strip()))
+    return numpy.array(weights, "d")
+
+
+def call_tadm(args):
+    """
+    Call the ``tadm`` binary with the given arguments.
+    """
+    if isinstance(args, str):
+        raise TypeError("args should be a list of strings")
+    if _tadm_bin is None:
+        config_tadm()
+
+    # Call tadm via a subprocess
+    cmd = [_tadm_bin] + args
+    p = subprocess.Popen(cmd, stdout=sys.stdout)
+    (stdout, stderr) = p.communicate()
+
+    # Check the return code.
+    if p.returncode != 0:
+        print()
+        print(stderr)
+        raise OSError("tadm command failed!")
+
+
+def names_demo():
+    from nltk.classify.maxent import TadmMaxentClassifier
+    from nltk.classify.util import names_demo
+
+    classifier = names_demo(TadmMaxentClassifier.train)
+
+
+def encoding_demo():
+    import sys
+
+    from nltk.classify.maxent import TadmEventMaxentFeatureEncoding
+
+    tokens = [
+        ({"f0": 1, "f1": 1, "f3": 1}, "A"),
+        ({"f0": 1, "f2": 1, "f4": 1}, "B"),
+        ({"f0": 2, "f2": 1, "f3": 1, "f4": 1}, "A"),
+    ]
+    encoding = TadmEventMaxentFeatureEncoding.train(tokens)
+    write_tadm_file(tokens, encoding, sys.stdout)
+    print()
+    for i in range(encoding.length()):
+        print("%s --> %d" % (encoding.describe(i), i))
+    print()
+
+
+if __name__ == "__main__":
+    encoding_demo()
+    names_demo()
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/classify/textcat.py b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/textcat.py
new file mode 100644
index 0000000000000000000000000000000000000000..df408726f14f1ea00442978c4ef110fa6eb02f2e
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/classify/textcat.py
@@ -0,0 +1,197 @@
+# Natural Language Toolkit: Language ID module using TextCat algorithm
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Avital Pekker
+#
+# URL:
+# For license information, see LICENSE.TXT
+
+"""
+A module for language identification using the TextCat algorithm.
+An implementation of the text categorization algorithm
+presented in Cavnar, W. B. and J. M. Trenkle,
+"N-Gram-Based Text Categorization".
+
+The algorithm takes advantage of Zipf's law and uses
+n-gram frequencies to profile languages and text (yet to
+be identified), then compares them using a distance measure.
+
+Language n-grams are provided by the "An Crubadan"
+project. A corpus reader was created separately to read
+those files.
+
+For details regarding the algorithm, see:
+https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf
+
+For details about An Crubadan, see:
+https://borel.slu.edu/crubadan/index.html
+"""
+
+from sys import maxsize
+
+from nltk.util import trigrams
+
+# Note: this is NOT the "re" module you're likely used to. The regex
+# module is an alternative to the standard re module that supports
+# Unicode codepoint properties with the \p{} syntax.
+# You may have to "pip install regex".
+try:
+    import regex as re
+except ImportError:
+    re = None
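Before the implementation, here is a toy, dependency-free sketch of the Cavnar & Trenkle "out-of-place" distance described in the module docstring above: profiles are trigram lists ranked by decreasing frequency, and the `missing` penalty stands in for `sys.maxsize` (the `out_of_place` name is illustrative, not NLTK API):

    def out_of_place(text_profile, lang_profile, missing=10**6):
        # Rank = position in the frequency-ordered profile.
        text_rank = {g: i for i, g in enumerate(text_profile)}
        lang_rank = {g: i for i, g in enumerate(lang_profile)}
        # Sum of rank displacements; unseen trigrams get a large penalty.
        return sum(
            abs(r - lang_rank[g]) if g in lang_rank else missing
            for g, r in text_rank.items()
        )

    print(out_of_place(["the", "he ", "ing"], ["the", "ing", "he "]))  # 2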
+######################################################################
+## Language identification using TextCat
+######################################################################
+
+
+class TextCat:
+
+    _corpus = None
+    fingerprints = {}
+    _START_CHAR = "<"
+    _END_CHAR = ">"
+
+    last_distances = {}
+
+    def __init__(self):
+        if not re:
+            raise OSError(
+                "classify.textcat requires the regex module that "
+                "supports unicode. Try '$ pip install regex' and "
+                "see https://pypi.python.org/pypi/regex for "
+                "further details."
+            )
+
+        from nltk.corpus import crubadan
+
+        self._corpus = crubadan
+        # Load all language ngrams into cache
+        for lang in self._corpus.langs():
+            self._corpus.lang_freq(lang)
+
+    def remove_punctuation(self, text):
+        """Get rid of punctuation except apostrophes"""
+        return re.sub(r"[^\P{P}\']+", "", text)
+
+    def profile(self, text):
+        """Create FreqDist of trigrams within text"""
+        from nltk import FreqDist, word_tokenize
+
+        clean_text = self.remove_punctuation(text)
+        tokens = word_tokenize(clean_text)
+
+        fingerprint = FreqDist()
+        for t in tokens:
+            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
+            token_trigrams = ["".join(tri) for tri in token_trigram_tuples]
+
+            for cur_trigram in token_trigrams:
+                if cur_trigram in fingerprint:
+                    fingerprint[cur_trigram] += 1
+                else:
+                    fingerprint[cur_trigram] = 1
+
+        return fingerprint
+
+    def calc_dist(self, lang, trigram, text_profile):
+        """Calculate the "out-of-place" measure between the
+        text and language profile for a single trigram"""
+
+        lang_fd = self._corpus.lang_freq(lang)
+        dist = 0
+
+        if trigram in lang_fd:
+            idx_lang_profile = list(lang_fd.keys()).index(trigram)
+            idx_text = list(text_profile.keys()).index(trigram)
+
+            # print(idx_lang_profile, ", ", idx_text)
+            dist = abs(idx_lang_profile - idx_text)
+        else:
+            # Arbitrary but should be larger than
+            # any possible trigram file length
+            # in terms of total lines
+            dist = maxsize
+
+        return dist
+
+    def lang_dists(self, text):
+        """Calculate the "out-of-place" measure between
+        the text and all languages"""
+
+        distances = {}
+        profile = self.profile(text)
+        # For all the languages
+        for lang in self._corpus._all_lang_freq.keys():
+            # Calculate distance metric for every trigram in
+            # input text to be identified
+            lang_dist = 0
+            for trigram in profile:
+                lang_dist += self.calc_dist(lang, trigram, profile)
+
+            distances[lang] = lang_dist
+
+        return distances
+
+    def guess_language(self, text):
+        """Find the language with the min distance
+        to the text and return its ISO 639-3 code"""
+        self.last_distances = self.lang_dists(text)
+
+        return min(self.last_distances, key=self.last_distances.get)
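A hedged usage sketch for the class above; it assumes the third-party `regex` package is installed and that the `crubadan` and `punkt` NLTK data packages have been downloaded, and the ISO 639-3 code shown is only indicative:

    import nltk
    from nltk.classify.textcat import TextCat

    nltk.download("crubadan")  # language trigram profiles
    nltk.download("punkt")     # tokenizer used by profile()

    tc = TextCat()
    print(tc.guess_language("Das ist ein kurzer deutscher Satz."))  # e.g. 'deu'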
"Northern Kurdish", + "abk": "Abkhazian", + "pes": "Iranian Persian", + "hin": "Hindi", + "haw": "Hawaiian", + "rus": "Russian", + "vie": "Vietnamese", + "srp": "Serbian", + "epo": "Esperanto", + } + + tc = TextCat() + + for cur_lang in langs: + # Get raw data from UDHR corpus + raw_sentences = udhr.sents(cur_lang) + rows = len(raw_sentences) - 1 + cols = list(map(len, raw_sentences)) + + sample = "" + + # Generate a sample text of the language + for i in range(0, rows): + cur_sent = "" + for j in range(0, cols[i]): + cur_sent += " " + raw_sentences[i][j] + + sample += cur_sent + + # Try to detect what it is + print("Language snippet: " + sample[0:140] + "...") + guess = tc.guess_language(sample) + print(f"Language detection: {guess} ({friendly[guess]})") + print("#" * 140) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/cli.py b/.eggs/nltk-3.8-py3.10.egg/nltk/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..f678221e6935f6e6d21538a6e578a49c02581f07 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/cli.py @@ -0,0 +1,55 @@ +# Natural Language Toolkit: NLTK Command-Line Interface +# +# Copyright (C) 2001-2022 NLTK Project +# URL: +# For license information, see LICENSE.TXT + + +import click +from tqdm import tqdm + +from nltk import word_tokenize +from nltk.util import parallelize_preprocess + +CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) + + +@click.group(context_settings=CONTEXT_SETTINGS) +@click.version_option() +def cli(): + pass + + +@cli.command("tokenize") +@click.option( + "--language", + "-l", + default="en", + help="The language for the Punkt sentence tokenization.", +) +@click.option( + "--preserve-line", + "-l", + default=True, + is_flag=True, + help="An option to keep the preserve the sentence and not sentence tokenize it.", +) +@click.option("--processes", "-j", default=1, help="No. of processes.") +@click.option("--encoding", "-e", default="utf8", help="Specify encoding of file.") +@click.option( + "--delimiter", "-d", default=" ", help="Specify delimiter to join the tokens." +) +def tokenize_file(language, preserve_line, processes, encoding, delimiter): + """This command tokenizes text stream using nltk.word_tokenize""" + with click.get_text_stream("stdin", encoding=encoding) as fin: + with click.get_text_stream("stdout", encoding=encoding) as fout: + # If it's single process, joblib parallelization is slower, + # so just process line by line normally. 
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/cluster/util.py b/.eggs/nltk-3.8-py3.10.egg/nltk/cluster/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..7be5e33317ed247156e0f092aff52d1a1e623778
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/cluster/util.py
@@ -0,0 +1,300 @@
+# Natural Language Toolkit: Clusterer Utilities
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Trevor Cohn
+# Contributor: J Richard Snape
+# URL:
+# For license information, see LICENSE.TXT
+import copy
+from abc import abstractmethod
+from math import sqrt
+from sys import stdout
+
+try:
+    import numpy
+except ImportError:
+    pass
+
+from nltk.cluster.api import ClusterI
+
+
+class VectorSpaceClusterer(ClusterI):
+    """
+    Abstract clusterer which takes tokens and maps them into a vector space.
+    Optionally performs singular value decomposition to reduce the
+    dimensionality.
+    """
+
+    def __init__(self, normalise=False, svd_dimensions=None):
+        """
+        :param normalise: should vectors be normalised to length 1
+        :type normalise: boolean
+        :param svd_dimensions: number of dimensions to use in reducing vector
+            dimensionality with SVD
+        :type svd_dimensions: int
+        """
+        self._Tt = None
+        self._should_normalise = normalise
+        self._svd_dimensions = svd_dimensions
+
+    def cluster(self, vectors, assign_clusters=False, trace=False):
+        assert len(vectors) > 0
+
+        # normalise the vectors
+        if self._should_normalise:
+            vectors = list(map(self._normalise, vectors))
+
+        # use SVD to reduce the dimensionality
+        if self._svd_dimensions and self._svd_dimensions < len(vectors[0]):
+            [u, d, vt] = numpy.linalg.svd(numpy.transpose(numpy.array(vectors)))
+            S = d[: self._svd_dimensions] * numpy.identity(
+                self._svd_dimensions, numpy.float64
+            )
+            T = u[:, : self._svd_dimensions]
+            Dt = vt[: self._svd_dimensions, :]
+            vectors = numpy.transpose(numpy.dot(S, Dt))
+            self._Tt = numpy.transpose(T)
+
+        # call abstract method to cluster the vectors
+        self.cluster_vectorspace(vectors, trace)
+
+        # assign the vectors to clusters
+        if assign_clusters:
+            return [self.classify(vector) for vector in vectors]
+
+    @abstractmethod
+    def cluster_vectorspace(self, vectors, trace):
+        """
+        Finds the clusters using the given set of vectors.
+        """
+
+    def classify(self, vector):
+        if self._should_normalise:
+            vector = self._normalise(vector)
+        if self._Tt is not None:
+            vector = numpy.dot(self._Tt, vector)
+        cluster = self.classify_vectorspace(vector)
+        return self.cluster_name(cluster)
+
+    @abstractmethod
+    def classify_vectorspace(self, vector):
+        """
+        Returns the index of the appropriate cluster for the vector.
+        """
+
+    def likelihood(self, vector, label):
+        if self._should_normalise:
+            vector = self._normalise(vector)
+        if self._Tt is not None:
+            vector = numpy.dot(self._Tt, vector)
+        return self.likelihood_vectorspace(vector, label)
+
+    def likelihood_vectorspace(self, vector, cluster):
+        """
+        Returns the likelihood of the vector belonging to the cluster.
+        """
+        predicted = self.classify_vectorspace(vector)
+        return 1.0 if cluster == predicted else 0.0
+
+    def vector(self, vector):
+        """
+        Returns the vector after normalisation and dimensionality reduction
+        """
+        if self._should_normalise:
+            vector = self._normalise(vector)
+        if self._Tt is not None:
+            vector = numpy.dot(self._Tt, vector)
+        return vector
+
+    def _normalise(self, vector):
+        """
+        Normalises the vector to unit length.
+        """
+        return vector / sqrt(numpy.dot(vector, vector))
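As a worked illustration of the SVD reduction performed in the `cluster()` method above, the following standalone numpy sketch keeps the top `k` singular components of the (features x samples) matrix and derives both the reduced sample vectors and the `Tt` projection later used by `classify()` (toy data, not NLTK code):

    import numpy

    vectors = numpy.array([[1.0, 0.0, 1.0], [0.0, 1.0, 1.0], [1.0, 1.0, 0.0]])
    k = 2
    u, d, vt = numpy.linalg.svd(numpy.transpose(vectors))
    S = d[:k] * numpy.identity(k, numpy.float64)        # top-k singular values
    reduced = numpy.transpose(numpy.dot(S, vt[:k, :]))  # one k-dim row per vector
    Tt = numpy.transpose(u[:, :k])                      # projects new vectors
    print(reduced.shape, numpy.dot(Tt, vectors[0]).shape)  # (3, 2) (2,)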
+ """ + predicted = self.classify_vectorspace(vector) + return 1.0 if cluster == predicted else 0.0 + + def vector(self, vector): + """ + Returns the vector after normalisation and dimensionality reduction + """ + if self._should_normalise: + vector = self._normalise(vector) + if self._Tt is not None: + vector = numpy.dot(self._Tt, vector) + return vector + + def _normalise(self, vector): + """ + Normalises the vector to unit length. + """ + return vector / sqrt(numpy.dot(vector, vector)) + + +def euclidean_distance(u, v): + """ + Returns the euclidean distance between vectors u and v. This is equivalent + to the length of the vector (u - v). + """ + diff = u - v + return sqrt(numpy.dot(diff, diff)) + + +def cosine_distance(u, v): + """ + Returns 1 minus the cosine of the angle between vectors v and u. This is + equal to ``1 - (u.v / |u||v|)``. + """ + return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v)))) + + +class _DendrogramNode: + """Tree node of a dendrogram.""" + + def __init__(self, value, *children): + self._value = value + self._children = children + + def leaves(self, values=True): + if self._children: + leaves = [] + for child in self._children: + leaves.extend(child.leaves(values)) + return leaves + elif values: + return [self._value] + else: + return [self] + + def groups(self, n): + queue = [(self._value, self)] + + while len(queue) < n: + priority, node = queue.pop() + if not node._children: + queue.push((priority, node)) + break + for child in node._children: + if child._children: + queue.append((child._value, child)) + else: + queue.append((0, child)) + # makes the earliest merges at the start, latest at the end + queue.sort() + + groups = [] + for priority, node in queue: + groups.append(node.leaves()) + return groups + + def __lt__(self, comparator): + return cosine_distance(self._value, comparator._value) < 0 + + +class Dendrogram: + """ + Represents a dendrogram, a tree with a specified branching order. This + must be initialised with the leaf items, then iteratively call merge for + each branch. This class constructs a tree representing the order of calls + to the merge function. + """ + + def __init__(self, items=[]): + """ + :param items: the items at the leaves of the dendrogram + :type items: sequence of (any) + """ + self._items = [_DendrogramNode(item) for item in items] + self._original_items = copy.copy(self._items) + self._merge = 1 + + def merge(self, *indices): + """ + Merges nodes at given indices in the dendrogram. The nodes will be + combined which then replaces the first node specified. All other nodes + involved in the merge will be removed. + + :param indices: indices of the items to merge (at least two) + :type indices: seq of int + """ + assert len(indices) >= 2 + node = _DendrogramNode(self._merge, *(self._items[i] for i in indices)) + self._merge += 1 + self._items[indices[0]] = node + for i in indices[1:]: + del self._items[i] + + def groups(self, n): + """ + Finds the n-groups of items (leaves) reachable from a cut at depth n. + :param n: number of groups + :type n: int + """ + if len(self._items) > 1: + root = _DendrogramNode(self._merge, *self._items) + else: + root = self._items[0] + return root.groups(n) + + def show(self, leaf_labels=[]): + """ + Print the dendrogram in ASCII art to standard out. 
+
+        :param leaf_labels: an optional list of strings to use for labeling the
+            leaves
+        :type leaf_labels: list
+        """
+
+        # ASCII rendering characters
+        JOIN, HLINK, VLINK = "+", "-", "|"
+
+        # find the root (or create one)
+        if len(self._items) > 1:
+            root = _DendrogramNode(self._merge, *self._items)
+        else:
+            root = self._items[0]
+        leaves = self._original_items
+
+        if leaf_labels:
+            last_row = leaf_labels
+        else:
+            last_row = ["%s" % leaf._value for leaf in leaves]
+
+        # find the bottom row and the best cell width
+        width = max(map(len, last_row)) + 1
+        lhalf = width // 2
+        rhalf = int(width - lhalf - 1)
+
+        # display functions
+        def format(centre, left=" ", right=" "):
+            return f"{lhalf * left}{centre}{right * rhalf}"
+
+        def display(str):
+            stdout.write(str)
+
+        # for each merge, top down
+        queue = [(root._value, root)]
+        verticals = [format(" ") for leaf in leaves]
+        while queue:
+            priority, node = queue.pop()
+            child_left_leaf = list(map(lambda c: c.leaves(False)[0], node._children))
+            indices = list(map(leaves.index, child_left_leaf))
+            if child_left_leaf:
+                min_idx = min(indices)
+                max_idx = max(indices)
+                for i in range(len(leaves)):
+                    if leaves[i] in child_left_leaf:
+                        if i == min_idx:
+                            display(format(JOIN, " ", HLINK))
+                        elif i == max_idx:
+                            display(format(JOIN, HLINK, " "))
+                        else:
+                            display(format(JOIN, HLINK, HLINK))
+                        verticals[i] = format(VLINK)
+                    elif min_idx <= i <= max_idx:
+                        display(format(HLINK, HLINK, HLINK))
+                    else:
+                        display(verticals[i])
+                display("\n")
+            for child in node._children:
+                if child._children:
+                    queue.append((child._value, child))
+            queue.sort()
+
+            for vertical in verticals:
+                display(vertical)
+            display("\n")
+
+        # finally, display the last line
+        display("".join(item.center(width) for item in last_row))
+        display("\n")
+
+    def __repr__(self):
+        if len(self._items) > 1:
+            root = _DendrogramNode(self._merge, *self._items)
+        else:
+            root = self._items[0]
+        leaves = root.leaves(False)
+        return "<Dendrogram with %d leaves>" % len(leaves)
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/collections.py b/.eggs/nltk-3.8-py3.10.egg/nltk/collections.py
new file mode 100644
index 0000000000000000000000000000000000000000..92924b1e442c2e70c2ece4386e0b731d3ae43de0
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/collections.py
@@ -0,0 +1,661 @@
+# Natural Language Toolkit: Collections
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Steven Bird
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+import bisect
+
+# this unused import is for python 2.7
+from collections import Counter, defaultdict, deque
+from functools import total_ordering
+from itertools import chain, islice
+
+from nltk.internals import raise_unorderable_types, slice_bounds
+
+##########################################################################
+# Ordered Dictionary
+##########################################################################
+
+
+class OrderedDict(dict):
+    def __init__(self, data=None, **kwargs):
+        self._keys = self.keys(data, kwargs.get("keys"))
+        self._default_factory = kwargs.get("default_factory")
+        if data is None:
+            dict.__init__(self)
+        else:
+            dict.__init__(self, data)
+
+    def __delitem__(self, key):
+        dict.__delitem__(self, key)
+        self._keys.remove(key)
+
+    def __getitem__(self, key):
+        try:
+            return dict.__getitem__(self, key)
+        except KeyError:
+            return self.__missing__(key)
+
+    def __iter__(self):
+        return (key for key in self.keys())
+
+    def __missing__(self, key):
+        if not self._default_factory and key not in self._keys:
+            raise KeyError()
+        return self._default_factory()
+
+    def 
__setitem__(self, key, item): + dict.__setitem__(self, key, item) + if key not in self._keys: + self._keys.append(key) + + def clear(self): + dict.clear(self) + self._keys.clear() + + def copy(self): + d = dict.copy(self) + d._keys = self._keys + return d + + def items(self): + # returns iterator under python 3 and list under python 2 + return zip(self.keys(), self.values()) + + def keys(self, data=None, keys=None): + if data: + if keys: + assert isinstance(keys, list) + assert len(data) == len(keys) + return keys + else: + assert ( + isinstance(data, dict) + or isinstance(data, OrderedDict) + or isinstance(data, list) + ) + if isinstance(data, dict) or isinstance(data, OrderedDict): + return data.keys() + elif isinstance(data, list): + return [key for (key, value) in data] + elif "_keys" in self.__dict__: + return self._keys + else: + return [] + + def popitem(self): + if not self._keys: + raise KeyError() + + key = self._keys.pop() + value = self[key] + del self[key] + return (key, value) + + def setdefault(self, key, failobj=None): + dict.setdefault(self, key, failobj) + if key not in self._keys: + self._keys.append(key) + + def update(self, data): + dict.update(self, data) + for key in self.keys(data): + if key not in self._keys: + self._keys.append(key) + + def values(self): + # returns iterator under python 3 + return map(self.get, self._keys) + + +###################################################################### +# Lazy Sequences +###################################################################### + + +@total_ordering +class AbstractLazySequence: + """ + An abstract base class for read-only sequences whose values are + computed as needed. Lazy sequences act like tuples -- they can be + indexed, sliced, and iterated over; but they may not be modified. + + The most common application of lazy sequences in NLTK is for + corpus view objects, which provide access to the contents of a + corpus without loading the entire corpus into memory, by loading + pieces of the corpus from disk as needed. + + The result of modifying a mutable element of a lazy sequence is + undefined. In particular, the modifications made to the element + may or may not persist, depending on whether and when the lazy + sequence caches that element's value or reconstructs it from + scratch. + + Subclasses are required to define two methods: ``__len__()`` + and ``iterate_from()``. + """ + + def __len__(self): + """ + Return the number of tokens in the corpus file underlying this + corpus view. + """ + raise NotImplementedError("should be implemented by subclass") + + def iterate_from(self, start): + """ + Return an iterator that generates the tokens in the corpus + file underlying this corpus view, starting at the token number + ``start``. If ``start>=len(self)``, then this iterator will + generate no tokens. + """ + raise NotImplementedError("should be implemented by subclass") + + def __getitem__(self, i): + """ + Return the *i* th token in the corpus file underlying this + corpus view. Negative indices and spans are both supported. + """ + if isinstance(i, slice): + start, stop = slice_bounds(self, i) + return LazySubsequence(self, start, stop) + else: + # Handle negative indices + if i < 0: + i += len(self) + if i < 0: + raise IndexError("index out of range") + # Use iterate_from to extract it. 
+ try: + return next(self.iterate_from(i)) + except StopIteration as e: + raise IndexError("index out of range") from e + + def __iter__(self): + """Return an iterator that generates the tokens in the corpus + file underlying this corpus view.""" + return self.iterate_from(0) + + def count(self, value): + """Return the number of times this list contains ``value``.""" + return sum(1 for elt in self if elt == value) + + def index(self, value, start=None, stop=None): + """Return the index of the first occurrence of ``value`` in this + list that is greater than or equal to ``start`` and less than + ``stop``. Negative start and stop values are treated like negative + slice bounds -- i.e., they count from the end of the list.""" + start, stop = slice_bounds(self, slice(start, stop)) + for i, elt in enumerate(islice(self, start, stop)): + if elt == value: + return i + start + raise ValueError("index(x): x not in list") + + def __contains__(self, value): + """Return true if this list contains ``value``.""" + return bool(self.count(value)) + + def __add__(self, other): + """Return a list concatenating self with other.""" + return LazyConcatenation([self, other]) + + def __radd__(self, other): + """Return a list concatenating other with self.""" + return LazyConcatenation([other, self]) + + def __mul__(self, count): + """Return a list concatenating self with itself ``count`` times.""" + return LazyConcatenation([self] * count) + + def __rmul__(self, count): + """Return a list concatenating self with itself ``count`` times.""" + return LazyConcatenation([self] * count) + + _MAX_REPR_SIZE = 60 + + def __repr__(self): + """ + Return a string representation for this corpus view that is + similar to a list's representation; but if it would be more + than 60 characters long, it is truncated. + """ + pieces = [] + length = 5 + for elt in self: + pieces.append(repr(elt)) + length += len(pieces[-1]) + 2 + if length > self._MAX_REPR_SIZE and len(pieces) > 2: + return "[%s, ...]" % ", ".join(pieces[:-1]) + return "[%s]" % ", ".join(pieces) + + def __eq__(self, other): + return type(self) == type(other) and list(self) == list(other) + + def __ne__(self, other): + return not self == other + + def __lt__(self, other): + if type(other) != type(self): + raise_unorderable_types("<", self, other) + return list(self) < list(other) + + def __hash__(self): + """ + :raise ValueError: Corpus view objects are unhashable. + """ + raise ValueError("%s objects are unhashable" % self.__class__.__name__) + + +class LazySubsequence(AbstractLazySequence): + """ + A subsequence produced by slicing a lazy sequence. This slice + keeps a reference to its source sequence, and generates its values + by looking them up in the source sequence. + """ + + MIN_SIZE = 100 + """ + The minimum size for which lazy slices should be created. If + ``LazySubsequence()`` is called with a subsequence that is + shorter than ``MIN_SIZE``, then a tuple will be returned instead. + """ + + def __new__(cls, source, start, stop): + """ + Construct a new slice from a given underlying sequence. The + ``start`` and ``stop`` indices should be absolute indices -- + i.e., they should not be negative (for indexing from the back + of a list) or greater than the length of ``source``. + """ + # If the slice is small enough, just use a tuple. 
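As the AbstractLazySequence docstring above notes, a concrete subclass only needs __len__() and iterate_from(); indexing, slicing, count(), index() and concatenation all come from the base class. A small hypothetical subclass to illustrate the contract:

    from nltk.collections import AbstractLazySequence

    class LazySquares(AbstractLazySequence):
        """Squares of 0..n-1, computed on demand (illustration only)."""

        def __init__(self, n):
            self._n = n

        def __len__(self):
            return self._n

        def iterate_from(self, start):
            return (i * i for i in range(start, self._n))

    seq = LazySquares(10**6)
    print(seq[3], seq[-1])  # 9 999998000001 -- no million-element list is built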
+ if stop - start < cls.MIN_SIZE: + return list(islice(source.iterate_from(start), stop - start)) + else: + return object.__new__(cls) + + def __init__(self, source, start, stop): + self._source = source + self._start = start + self._stop = stop + + def __len__(self): + return self._stop - self._start + + def iterate_from(self, start): + return islice( + self._source.iterate_from(start + self._start), max(0, len(self) - start) + ) + + +class LazyConcatenation(AbstractLazySequence): + """ + A lazy sequence formed by concatenating a list of lists. This + underlying list of lists may itself be lazy. ``LazyConcatenation`` + maintains an index that it uses to keep track of the relationship + between offsets in the concatenated lists and offsets in the + sublists. + """ + + def __init__(self, list_of_lists): + self._list = list_of_lists + self._offsets = [0] + + def __len__(self): + if len(self._offsets) <= len(self._list): + for _ in self.iterate_from(self._offsets[-1]): + pass + return self._offsets[-1] + + def iterate_from(self, start_index): + if start_index < self._offsets[-1]: + sublist_index = bisect.bisect_right(self._offsets, start_index) - 1 + else: + sublist_index = len(self._offsets) - 1 + + index = self._offsets[sublist_index] + + # Construct an iterator over the sublists. + if isinstance(self._list, AbstractLazySequence): + sublist_iter = self._list.iterate_from(sublist_index) + else: + sublist_iter = islice(self._list, sublist_index, None) + + for sublist in sublist_iter: + if sublist_index == (len(self._offsets) - 1): + assert ( + index + len(sublist) >= self._offsets[-1] + ), "offsets not monotonic increasing!" + self._offsets.append(index + len(sublist)) + else: + assert self._offsets[sublist_index + 1] == index + len( + sublist + ), "inconsistent list value (num elts)" + + yield from sublist[max(0, start_index - index) :] + + index += len(sublist) + sublist_index += 1 + + +class LazyMap(AbstractLazySequence): + """ + A lazy sequence whose elements are formed by applying a given + function to each element in one or more underlying lists. The + function is applied lazily -- i.e., when you read a value from the + list, ``LazyMap`` will calculate that value by applying its + function to the underlying lists' value(s). ``LazyMap`` is + essentially a lazy version of the Python primitive function + ``map``. In particular, the following two expressions are + equivalent: + + >>> from nltk.collections import LazyMap + >>> function = str + >>> sequence = [1,2,3] + >>> map(function, sequence) # doctest: +SKIP + ['1', '2', '3'] + >>> list(LazyMap(function, sequence)) + ['1', '2', '3'] + + Like the Python ``map`` primitive, if the source lists do not have + equal size, then the value None will be supplied for the + 'missing' elements. + + Lazy maps can be useful for conserving memory, in cases where + individual values take up a lot of space. This is especially true + if the underlying list's values are constructed lazily, as is the + case with many corpus readers. + + A typical example of a use case for this class is performing + feature detection on the tokens in a corpus. Since featuresets + are encoded as dictionaries, which can take up a lot of memory, + using a ``LazyMap`` can significantly reduce memory usage when + training and running classifiers. + """ + + def __init__(self, function, *lists, **config): + """ + :param function: The function that should be applied to + elements of ``lists``. It should take as many arguments + as there are ``lists``. 
+ :param lists: The underlying lists. + :param cache_size: Determines the size of the cache used + by this lazy map. (default=5) + """ + if not lists: + raise TypeError("LazyMap requires at least two args") + + self._lists = lists + self._func = function + self._cache_size = config.get("cache_size", 5) + self._cache = {} if self._cache_size > 0 else None + + # If you just take bool() of sum() here _all_lazy will be true just + # in case n >= 1 list is an AbstractLazySequence. Presumably this + # isn't what's intended. + self._all_lazy = sum( + isinstance(lst, AbstractLazySequence) for lst in lists + ) == len(lists) + + def iterate_from(self, index): + # Special case: one lazy sublist + if len(self._lists) == 1 and self._all_lazy: + for value in self._lists[0].iterate_from(index): + yield self._func(value) + return + + # Special case: one non-lazy sublist + elif len(self._lists) == 1: + while True: + try: + yield self._func(self._lists[0][index]) + except IndexError: + return + index += 1 + + # Special case: n lazy sublists + elif self._all_lazy: + iterators = [lst.iterate_from(index) for lst in self._lists] + while True: + elements = [] + for iterator in iterators: + try: + elements.append(next(iterator)) + except: # FIXME: What is this except really catching? StopIteration? + elements.append(None) + if elements == [None] * len(self._lists): + return + yield self._func(*elements) + index += 1 + + # general case + else: + while True: + try: + elements = [lst[index] for lst in self._lists] + except IndexError: + elements = [None] * len(self._lists) + for i, lst in enumerate(self._lists): + try: + elements[i] = lst[index] + except IndexError: + pass + if elements == [None] * len(self._lists): + return + yield self._func(*elements) + index += 1 + + def __getitem__(self, index): + if isinstance(index, slice): + sliced_lists = [lst[index] for lst in self._lists] + return LazyMap(self._func, *sliced_lists) + else: + # Handle negative indices + if index < 0: + index += len(self) + if index < 0: + raise IndexError("index out of range") + # Check the cache + if self._cache is not None and index in self._cache: + return self._cache[index] + # Calculate the value + try: + val = next(self.iterate_from(index)) + except StopIteration as e: + raise IndexError("index out of range") from e + # Update the cache + if self._cache is not None: + if len(self._cache) > self._cache_size: + self._cache.popitem() # discard random entry + self._cache[index] = val + # Return the value + return val + + def __len__(self): + return max(len(lst) for lst in self._lists) + + +class LazyZip(LazyMap): + """ + A lazy sequence whose elements are tuples, each containing the i-th + element from each of the argument sequences. The returned list is + truncated in length to the length of the shortest argument sequence. The + tuples are constructed lazily -- i.e., when you read a value from the + list, ``LazyZip`` will calculate that value by forming a tuple from + the i-th element of each of the argument sequences. + + ``LazyZip`` is essentially a lazy version of the Python primitive function + ``zip``. 
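Before moving on to LazyZip, the practical point of LazyMap beyond the doctest above: elements are computed only when accessed, and a small cache (five entries by default) avoids recomputing recent lookups. A quick sketch:

    from nltk.collections import LazyMap

    lm = LazyMap(lambda x: x * x, range(10**6))  # nothing is computed yet
    print(lm[10], lm[10])  # 100 100 -- the second lookup is served from the cache
    print(len(lm))         # 1000000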
In particular, an evaluated LazyZip is equivalent to a zip: + + >>> from nltk.collections import LazyZip + >>> sequence1, sequence2 = [1, 2, 3], ['a', 'b', 'c'] + >>> zip(sequence1, sequence2) # doctest: +SKIP + [(1, 'a'), (2, 'b'), (3, 'c')] + >>> list(LazyZip(sequence1, sequence2)) + [(1, 'a'), (2, 'b'), (3, 'c')] + >>> sequences = [sequence1, sequence2, [6,7,8,9]] + >>> list(zip(*sequences)) == list(LazyZip(*sequences)) + True + + Lazy zips can be useful for conserving memory in cases where the argument + sequences are particularly long. + + A typical example of a use case for this class is combining long sequences + of gold standard and predicted values in a classification or tagging task + in order to calculate accuracy. By constructing tuples lazily and + avoiding the creation of an additional long sequence, memory usage can be + significantly reduced. + """ + + def __init__(self, *lists): + """ + :param lists: the underlying lists + :type lists: list(list) + """ + LazyMap.__init__(self, lambda *elts: elts, *lists) + + def iterate_from(self, index): + iterator = LazyMap.iterate_from(self, index) + while index < len(self): + yield next(iterator) + index += 1 + return + + def __len__(self): + return min(len(lst) for lst in self._lists) + + +class LazyEnumerate(LazyZip): + """ + A lazy sequence whose elements are tuples, each containing a count (from + zero) and a value yielded by underlying sequence. ``LazyEnumerate`` is + useful for obtaining an indexed list. The tuples are constructed lazily + -- i.e., when you read a value from the list, ``LazyEnumerate`` will + calculate that value by forming a tuple from the count of the i-th + element and the i-th element of the underlying sequence. + + ``LazyEnumerate`` is essentially a lazy version of the Python primitive + function ``enumerate``. In particular, the following two expressions are + equivalent: + + >>> from nltk.collections import LazyEnumerate + >>> sequence = ['first', 'second', 'third'] + >>> list(enumerate(sequence)) + [(0, 'first'), (1, 'second'), (2, 'third')] + >>> list(LazyEnumerate(sequence)) + [(0, 'first'), (1, 'second'), (2, 'third')] + + Lazy enumerations can be useful for conserving memory in cases where the + argument sequences are particularly long. + + A typical example of a use case for this class is obtaining an indexed + list for a long sequence of values. By constructing tuples lazily and + avoiding the creation of an additional long sequence, memory usage can be + significantly reduced. + """ + + def __init__(self, lst): + """ + :param lst: the underlying list + :type lst: list + """ + LazyZip.__init__(self, range(len(lst)), lst) + + +class LazyIteratorList(AbstractLazySequence): + """ + Wraps an iterator, loading its elements on demand + and making them subscriptable. + __repr__ displays only the first few elements. 
+ """ + + def __init__(self, it, known_len=None): + self._it = it + self._len = known_len + self._cache = [] + + def __len__(self): + if self._len: + return self._len + for _ in self.iterate_from(len(self._cache)): + pass + self._len = len(self._cache) + return self._len + + def iterate_from(self, start): + """Create a new iterator over this list starting at the given offset.""" + while len(self._cache) < start: + v = next(self._it) + self._cache.append(v) + i = start + while i < len(self._cache): + yield self._cache[i] + i += 1 + try: + while True: + v = next(self._it) + self._cache.append(v) + yield v + except StopIteration: + pass + + def __add__(self, other): + """Return a list concatenating self with other.""" + return type(self)(chain(self, other)) + + def __radd__(self, other): + """Return a list concatenating other with self.""" + return type(self)(chain(other, self)) + + +###################################################################### +# Trie Implementation +###################################################################### +class Trie(dict): + """A Trie implementation for strings""" + + LEAF = True + + def __init__(self, strings=None): + """Builds a Trie object, which is built around a ``dict`` + + If ``strings`` is provided, it will add the ``strings``, which + consist of a ``list`` of ``strings``, to the Trie. + Otherwise, it'll construct an empty Trie. + + :param strings: List of strings to insert into the trie + (Default is ``None``) + :type strings: list(str) + + """ + super().__init__() + if strings: + for string in strings: + self.insert(string) + + def insert(self, string): + """Inserts ``string`` into the Trie + + :param string: String to insert into the trie + :type string: str + + :Example: + + >>> from nltk.collections import Trie + >>> trie = Trie(["abc", "def"]) + >>> expected = {'a': {'b': {'c': {True: None}}}, \ + 'd': {'e': {'f': {True: None}}}} + >>> trie == expected + True + + """ + if len(string): + self[string[0]].insert(string[1:]) + else: + # mark the string is complete + self[Trie.LEAF] = None + + def __missing__(self, key): + self[key] = Trie() + return self[key] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/collocations.py b/.eggs/nltk-3.8-py3.10.egg/nltk/collocations.py new file mode 100644 index 0000000000000000000000000000000000000000..2dc17e6065d8e72d5e8d35dc391b9ec742457a0a --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/collocations.py @@ -0,0 +1,412 @@ +# Natural Language Toolkit: Collocations and Association Measures +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Joel Nothman +# URL: +# For license information, see LICENSE.TXT +# +""" +Tools to identify collocations --- words that often appear consecutively +--- within corpora. They may also be used to find other associations between +word occurrences. +See Manning and Schutze ch. 5 at https://nlp.stanford.edu/fsnlp/promo/colloc.pdf +and the Text::NSP Perl package at http://ngram.sourceforge.net + +Finding collocations requires first calculating the frequencies of words and +their appearance in the context of other words. Often the collection of words +will then requiring filtering to only retain useful content terms. Each ngram +of words may then be scored according to some association measure, in order +to determine the relative likelihood of each ngram being a collocation. + +The ``BigramCollocationFinder`` and ``TrigramCollocationFinder`` classes provide +these functionalities, dependent on being provided a function which scores a +ngram given appropriate frequency counts. 
A number of standard association +measures are provided in bigram_measures and trigram_measures. +""" + +# Possible TODOs: +# - consider the distinction between f(x,_) and f(x) and whether our +# approximation is good enough for fragmented data, and mention it +# - add a n-gram collocation finder with measures which only utilise n-gram +# and unigram counts (raw_freq, pmi, student_t) + +import itertools as _itertools + +# these two unused imports are referenced in collocations.doctest +from nltk.metrics import ( + BigramAssocMeasures, + ContingencyMeasures, + QuadgramAssocMeasures, + TrigramAssocMeasures, +) +from nltk.metrics.spearman import ranks_from_scores, spearman_correlation +from nltk.probability import FreqDist +from nltk.util import ngrams + + +class AbstractCollocationFinder: + """ + An abstract base class for collocation finders whose purpose is to + collect collocation candidate frequencies, filter and rank them. + + As a minimum, collocation finders require the frequencies of each + word in a corpus, and the joint frequency of word tuples. This data + should be provided through nltk.probability.FreqDist objects or an + identical interface. + """ + + def __init__(self, word_fd, ngram_fd): + self.word_fd = word_fd + self.N = word_fd.N() + self.ngram_fd = ngram_fd + + @classmethod + def _build_new_documents( + cls, documents, window_size, pad_left=False, pad_right=False, pad_symbol=None + ): + """ + Pad the document with the place holder according to the window_size + """ + padding = (pad_symbol,) * (window_size - 1) + if pad_right: + return _itertools.chain.from_iterable( + _itertools.chain(doc, padding) for doc in documents + ) + if pad_left: + return _itertools.chain.from_iterable( + _itertools.chain(padding, doc) for doc in documents + ) + + @classmethod + def from_documents(cls, documents): + """Constructs a collocation finder given a collection of documents, + each of which is a list (or iterable) of tokens. + """ + # return cls.from_words(_itertools.chain(*documents)) + return cls.from_words( + cls._build_new_documents(documents, cls.default_ws, pad_right=True) + ) + + @staticmethod + def _ngram_freqdist(words, n): + return FreqDist(tuple(words[i : i + n]) for i in range(len(words) - 1)) + + def _apply_filter(self, fn=lambda ngram, freq: False): + """Generic filter removes ngrams from the frequency distribution + if the function returns True when passed an ngram tuple. + """ + tmp_ngram = FreqDist() + for ngram, freq in self.ngram_fd.items(): + if not fn(ngram, freq): + tmp_ngram[ngram] = freq + self.ngram_fd = tmp_ngram + + def apply_freq_filter(self, min_freq): + """Removes candidate ngrams which have frequency less than min_freq.""" + self._apply_filter(lambda ng, freq: freq < min_freq) + + def apply_ngram_filter(self, fn): + """Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...) + evaluates to True. + """ + self._apply_filter(lambda ng, f: fn(*ng)) + + def apply_word_filter(self, fn): + """Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2), + ...) evaluates to True. + """ + self._apply_filter(lambda ng, f: any(fn(w) for w in ng)) + + def _score_ngrams(self, score_fn): + """Generates of (ngram, score) pairs as determined by the scoring + function provided. 
+ """ + for tup in self.ngram_fd: + score = self.score_ngram(score_fn, *tup) + if score is not None: + yield tup, score + + def score_ngrams(self, score_fn): + """Returns a sequence of (ngram, score) pairs ordered from highest to + lowest score, as determined by the scoring function provided. + """ + return sorted(self._score_ngrams(score_fn), key=lambda t: (-t[1], t[0])) + + def nbest(self, score_fn, n): + """Returns the top n ngrams when scored by the given function.""" + return [p for p, s in self.score_ngrams(score_fn)[:n]] + + def above_score(self, score_fn, min_score): + """Returns a sequence of ngrams, ordered by decreasing score, whose + scores each exceed the given minimum score. + """ + for ngram, score in self.score_ngrams(score_fn): + if score > min_score: + yield ngram + else: + break + + +class BigramCollocationFinder(AbstractCollocationFinder): + """A tool for the finding and ranking of bigram collocations or other + association measures. It is often useful to use from_words() rather than + constructing an instance directly. + """ + + default_ws = 2 + + def __init__(self, word_fd, bigram_fd, window_size=2): + """Construct a BigramCollocationFinder, given FreqDists for + appearances of words and (possibly non-contiguous) bigrams. + """ + AbstractCollocationFinder.__init__(self, word_fd, bigram_fd) + self.window_size = window_size + + @classmethod + def from_words(cls, words, window_size=2): + """Construct a BigramCollocationFinder for all bigrams in the given + sequence. When window_size > 2, count non-contiguous bigrams, in the + style of Church and Hanks's (1990) association ratio. + """ + wfd = FreqDist() + bfd = FreqDist() + + if window_size < 2: + raise ValueError("Specify window_size at least 2") + + for window in ngrams(words, window_size, pad_right=True): + w1 = window[0] + if w1 is None: + continue + wfd[w1] += 1 + for w2 in window[1:]: + if w2 is not None: + bfd[(w1, w2)] += 1 + return cls(wfd, bfd, window_size=window_size) + + def score_ngram(self, score_fn, w1, w2): + """Returns the score for a given bigram using the given scoring + function. Following Church and Hanks (1990), counts are scaled by + a factor of 1/(window_size - 1). + """ + n_all = self.N + n_ii = self.ngram_fd[(w1, w2)] / (self.window_size - 1.0) + if not n_ii: + return + n_ix = self.word_fd[w1] + n_xi = self.word_fd[w2] + return score_fn(n_ii, (n_ix, n_xi), n_all) + + +class TrigramCollocationFinder(AbstractCollocationFinder): + """A tool for the finding and ranking of trigram collocations or other + association measures. It is often useful to use from_words() rather than + constructing an instance directly. + """ + + default_ws = 3 + + def __init__(self, word_fd, bigram_fd, wildcard_fd, trigram_fd): + """Construct a TrigramCollocationFinder, given FreqDists for + appearances of words, bigrams, two words with any word between them, + and trigrams. + """ + AbstractCollocationFinder.__init__(self, word_fd, trigram_fd) + self.wildcard_fd = wildcard_fd + self.bigram_fd = bigram_fd + + @classmethod + def from_words(cls, words, window_size=3): + """Construct a TrigramCollocationFinder for all trigrams in the given + sequence. 
+ """ + if window_size < 3: + raise ValueError("Specify window_size at least 3") + + wfd = FreqDist() + wildfd = FreqDist() + bfd = FreqDist() + tfd = FreqDist() + for window in ngrams(words, window_size, pad_right=True): + w1 = window[0] + if w1 is None: + continue + for w2, w3 in _itertools.combinations(window[1:], 2): + wfd[w1] += 1 + if w2 is None: + continue + bfd[(w1, w2)] += 1 + if w3 is None: + continue + wildfd[(w1, w3)] += 1 + tfd[(w1, w2, w3)] += 1 + return cls(wfd, bfd, wildfd, tfd) + + def bigram_finder(self): + """Constructs a bigram collocation finder with the bigram and unigram + data from this finder. Note that this does not include any filtering + applied to this finder. + """ + return BigramCollocationFinder(self.word_fd, self.bigram_fd) + + def score_ngram(self, score_fn, w1, w2, w3): + """Returns the score for a given trigram using the given scoring + function. + """ + n_all = self.N + n_iii = self.ngram_fd[(w1, w2, w3)] + if not n_iii: + return + n_iix = self.bigram_fd[(w1, w2)] + n_ixi = self.wildcard_fd[(w1, w3)] + n_xii = self.bigram_fd[(w2, w3)] + n_ixx = self.word_fd[w1] + n_xix = self.word_fd[w2] + n_xxi = self.word_fd[w3] + return score_fn(n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi), n_all) + + +class QuadgramCollocationFinder(AbstractCollocationFinder): + """A tool for the finding and ranking of quadgram collocations or other association measures. + It is often useful to use from_words() rather than constructing an instance directly. + """ + + default_ws = 4 + + def __init__(self, word_fd, quadgram_fd, ii, iii, ixi, ixxi, iixi, ixii): + """Construct a QuadgramCollocationFinder, given FreqDists for appearances of words, + bigrams, trigrams, two words with one word and two words between them, three words + with a word between them in both variations. 
+ """ + AbstractCollocationFinder.__init__(self, word_fd, quadgram_fd) + self.iii = iii + self.ii = ii + self.ixi = ixi + self.ixxi = ixxi + self.iixi = iixi + self.ixii = ixii + + @classmethod + def from_words(cls, words, window_size=4): + if window_size < 4: + raise ValueError("Specify window_size at least 4") + ixxx = FreqDist() + iiii = FreqDist() + ii = FreqDist() + iii = FreqDist() + ixi = FreqDist() + ixxi = FreqDist() + iixi = FreqDist() + ixii = FreqDist() + + for window in ngrams(words, window_size, pad_right=True): + w1 = window[0] + if w1 is None: + continue + for w2, w3, w4 in _itertools.combinations(window[1:], 3): + ixxx[w1] += 1 + if w2 is None: + continue + ii[(w1, w2)] += 1 + if w3 is None: + continue + iii[(w1, w2, w3)] += 1 + ixi[(w1, w3)] += 1 + if w4 is None: + continue + iiii[(w1, w2, w3, w4)] += 1 + ixxi[(w1, w4)] += 1 + ixii[(w1, w3, w4)] += 1 + iixi[(w1, w2, w4)] += 1 + + return cls(ixxx, iiii, ii, iii, ixi, ixxi, iixi, ixii) + + def score_ngram(self, score_fn, w1, w2, w3, w4): + n_all = self.N + n_iiii = self.ngram_fd[(w1, w2, w3, w4)] + if not n_iiii: + return + n_iiix = self.iii[(w1, w2, w3)] + n_xiii = self.iii[(w2, w3, w4)] + n_iixi = self.iixi[(w1, w2, w4)] + n_ixii = self.ixii[(w1, w3, w4)] + + n_iixx = self.ii[(w1, w2)] + n_xxii = self.ii[(w3, w4)] + n_xiix = self.ii[(w2, w3)] + n_ixix = self.ixi[(w1, w3)] + n_ixxi = self.ixxi[(w1, w4)] + n_xixi = self.ixi[(w2, w4)] + + n_ixxx = self.word_fd[w1] + n_xixx = self.word_fd[w2] + n_xxix = self.word_fd[w3] + n_xxxi = self.word_fd[w4] + return score_fn( + n_iiii, + (n_iiix, n_iixi, n_ixii, n_xiii), + (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix), + (n_ixxx, n_xixx, n_xxix, n_xxxi), + n_all, + ) + + +def demo(scorer=None, compare_scorer=None): + """Finds bigram collocations in the files of the WebText corpus.""" + from nltk.metrics import ( + BigramAssocMeasures, + ranks_from_scores, + spearman_correlation, + ) + + if scorer is None: + scorer = BigramAssocMeasures.likelihood_ratio + if compare_scorer is None: + compare_scorer = BigramAssocMeasures.raw_freq + + from nltk.corpus import stopwords, webtext + + ignored_words = stopwords.words("english") + word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words + + for file in webtext.fileids(): + words = [word.lower() for word in webtext.words(file)] + + cf = BigramCollocationFinder.from_words(words) + cf.apply_freq_filter(3) + cf.apply_word_filter(word_filter) + + corr = spearman_correlation( + ranks_from_scores(cf.score_ngrams(scorer)), + ranks_from_scores(cf.score_ngrams(compare_scorer)), + ) + print(file) + print("\t", [" ".join(tup) for tup in cf.nbest(scorer, 15)]) + print(f"\t Correlation to {compare_scorer.__name__}: {corr:0.4f}") + + +# Slows down loading too much +# bigram_measures = BigramAssocMeasures() +# trigram_measures = TrigramAssocMeasures() + +if __name__ == "__main__": + import sys + + from nltk.metrics import BigramAssocMeasures + + try: + scorer = eval("BigramAssocMeasures." + sys.argv[1]) + except IndexError: + scorer = None + try: + compare_scorer = eval("BigramAssocMeasures." 
+ sys.argv[2]) + except IndexError: + compare_scorer = None + + demo(scorer, compare_scorer) + +__all__ = [ + "BigramCollocationFinder", + "TrigramCollocationFinder", + "QuadgramCollocationFinder", +] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/compat.py b/.eggs/nltk-3.8-py3.10.egg/nltk/compat.py new file mode 100644 index 0000000000000000000000000000000000000000..1153e1223b65ea49dafc5d4ad6a2ff94b0fffacc --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/compat.py @@ -0,0 +1,43 @@ +# Natural Language Toolkit: Compatibility +# +# Copyright (C) 2001-2022 NLTK Project +# +# URL: +# For license information, see LICENSE.TXT + +import os +from functools import wraps + +# ======= Compatibility for datasets that care about Python versions ======== + +# The following datasets have a /PY3 subdirectory containing +# a full copy of the data which has been re-encoded or repickled. +DATA_UPDATES = [ + ("chunkers", "maxent_ne_chunker"), + ("help", "tagsets"), + ("taggers", "maxent_treebank_pos_tagger"), + ("tokenizers", "punkt"), +] + +_PY3_DATA_UPDATES = [os.path.join(*path_list) for path_list in DATA_UPDATES] + + +def add_py3_data(path): + for item in _PY3_DATA_UPDATES: + if item in str(path) and "/PY3" not in str(path): + pos = path.index(item) + len(item) + if path[pos : pos + 4] == ".zip": + pos += 4 + path = path[:pos] + "/PY3" + path[pos:] + break + return path + + +# for use in adding /PY3 to the second (filename) argument +# of the file pointers in data.py +def py3_data(init_func): + def _decorator(*args, **kwargs): + args = (args[0], add_py3_data(args[1])) + args[2:] + return init_func(*args, **kwargs) + + return wraps(init_func)(_decorator) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/data.py b/.eggs/nltk-3.8-py3.10.egg/nltk/data.py new file mode 100644 index 0000000000000000000000000000000000000000..4be6053174ad2737da044fbd52dc0d6af0b428c4 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/data.py @@ -0,0 +1,1441 @@ +# Natural Language Toolkit: Utility functions +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# URL: +# For license information, see LICENSE.TXT + +""" +Functions to find and load NLTK resource files, such as corpora, +grammars, and saved processing objects. Resource files are identified +using URLs, such as ``nltk:corpora/abc/rural.txt`` or +``http://nltk.org/sample/toy.cfg``. The following URL protocols are +supported: + + - ``file:path``: Specifies the file whose path is *path*. + Both relative and absolute paths may be used. + + - ``https://host/path``: Specifies the file stored on the web + server *host* at path *path*. + + - ``nltk:path``: Specifies the file stored in the NLTK data + package at *path*. NLTK will search for these files in the + directories specified by ``nltk.data.path``. + +If no protocol is specified, then the default protocol ``nltk:`` will +be used. + +This module provides to functions that can be used to access a +resource file, given its URL: ``load()`` loads a given resource, and +adds it to a resource cache; and ``retrieve()`` copies a given resource +to a local file. 
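Incidentally, the eval("BigramAssocMeasures." + sys.argv[1]) lookups in the __main__ block above can be done without eval; a safer hypothetical variant:

    import sys
    from nltk.metrics import BigramAssocMeasures

    name = sys.argv[1] if len(sys.argv) > 1 else None
    scorer = getattr(BigramAssocMeasures, name) if name else None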
+""" + +import codecs +import functools +import os +import pickle +import re +import sys +import textwrap +import zipfile +from abc import ABCMeta, abstractmethod +from gzip import WRITE as GZ_WRITE +from gzip import GzipFile +from io import BytesIO, TextIOWrapper +from urllib.request import url2pathname, urlopen + +try: + from zlib import Z_SYNC_FLUSH as FLUSH +except ImportError: + from zlib import Z_FINISH as FLUSH + +from nltk import grammar, sem +from nltk.compat import add_py3_data, py3_data +from nltk.internals import deprecated + +textwrap_indent = functools.partial(textwrap.indent, prefix=" ") + +###################################################################### +# Search Path +###################################################################### + +path = [] +"""A list of directories where the NLTK data package might reside. + These directories will be checked in order when looking for a + resource in the data package. Note that this allows users to + substitute in their own versions of resources, if they have them + (e.g., in their home directory under ~/nltk_data).""" + +# User-specified locations: +_paths_from_env = os.environ.get("NLTK_DATA", "").split(os.pathsep) +path += [d for d in _paths_from_env if d] +if "APPENGINE_RUNTIME" not in os.environ and os.path.expanduser("~/") != "~/": + path.append(os.path.expanduser("~/nltk_data")) + +if sys.platform.startswith("win"): + # Common locations on Windows: + path += [ + os.path.join(sys.prefix, "nltk_data"), + os.path.join(sys.prefix, "share", "nltk_data"), + os.path.join(sys.prefix, "lib", "nltk_data"), + os.path.join(os.environ.get("APPDATA", "C:\\"), "nltk_data"), + r"C:\nltk_data", + r"D:\nltk_data", + r"E:\nltk_data", + ] +else: + # Common locations on UNIX & OS X: + path += [ + os.path.join(sys.prefix, "nltk_data"), + os.path.join(sys.prefix, "share", "nltk_data"), + os.path.join(sys.prefix, "lib", "nltk_data"), + "/usr/share/nltk_data", + "/usr/local/share/nltk_data", + "/usr/lib/nltk_data", + "/usr/local/lib/nltk_data", + ] + + +###################################################################### +# Util Functions +###################################################################### + + +def gzip_open_unicode( + filename, + mode="rb", + compresslevel=9, + encoding="utf-8", + fileobj=None, + errors=None, + newline=None, +): + if fileobj is None: + fileobj = GzipFile(filename, mode, compresslevel, fileobj) + return TextIOWrapper(fileobj, encoding, errors, newline) + + +def split_resource_url(resource_url): + """ + Splits a resource url into ":". + + >>> windows = sys.platform.startswith('win') + >>> split_resource_url('nltk:home/nltk') + ('nltk', 'home/nltk') + >>> split_resource_url('nltk:/home/nltk') + ('nltk', '/home/nltk') + >>> split_resource_url('file:/home/nltk') + ('file', '/home/nltk') + >>> split_resource_url('file:///home/nltk') + ('file', '/home/nltk') + >>> split_resource_url('file:///C:/home/nltk') + ('file', '/C:/home/nltk') + """ + protocol, path_ = resource_url.split(":", 1) + if protocol == "nltk": + pass + elif protocol == "file": + if path_.startswith("/"): + path_ = "/" + path_.lstrip("/") + else: + path_ = re.sub(r"^/{0,2}", "", path_) + return protocol, path_ + + +def normalize_resource_url(resource_url): + r""" + Normalizes a resource url + + >>> windows = sys.platform.startswith('win') + >>> os.path.normpath(split_resource_url(normalize_resource_url('file:grammar.fcfg'))[1]) == \ + ... 
('\\' if windows else '') + os.path.abspath(os.path.join(os.curdir, 'grammar.fcfg')) + True + >>> not windows or normalize_resource_url('file:C:/dir/file') == 'file:///C:/dir/file' + True + >>> not windows or normalize_resource_url('file:C:\\dir\\file') == 'file:///C:/dir/file' + True + >>> not windows or normalize_resource_url('file:C:\\dir/file') == 'file:///C:/dir/file' + True + >>> not windows or normalize_resource_url('file://C:/dir/file') == 'file:///C:/dir/file' + True + >>> not windows or normalize_resource_url('file:////C:/dir/file') == 'file:///C:/dir/file' + True + >>> not windows or normalize_resource_url('nltk:C:/dir/file') == 'file:///C:/dir/file' + True + >>> not windows or normalize_resource_url('nltk:C:\\dir\\file') == 'file:///C:/dir/file' + True + >>> windows or normalize_resource_url('file:/dir/file/toy.cfg') == 'file:///dir/file/toy.cfg' + True + >>> normalize_resource_url('nltk:home/nltk') + 'nltk:home/nltk' + >>> windows or normalize_resource_url('nltk:/home/nltk') == 'file:///home/nltk' + True + >>> normalize_resource_url('https://example.com/dir/file') + 'https://example.com/dir/file' + >>> normalize_resource_url('dir/file') + 'nltk:dir/file' + """ + try: + protocol, name = split_resource_url(resource_url) + except ValueError: + # the resource url has no protocol, use the nltk protocol by default + protocol = "nltk" + name = resource_url + # use file protocol if the path is an absolute path + if protocol == "nltk" and os.path.isabs(name): + protocol = "file://" + name = normalize_resource_name(name, False, None) + elif protocol == "file": + protocol = "file://" + # name is absolute + name = normalize_resource_name(name, False, None) + elif protocol == "nltk": + protocol = "nltk:" + name = normalize_resource_name(name, True) + else: + # handled by urllib + protocol += "://" + return "".join([protocol, name]) + + +def normalize_resource_name(resource_name, allow_relative=True, relative_path=None): + """ + :type resource_name: str or unicode + :param resource_name: The name of the resource to search for. + Resource names are posix-style relative path names, such as + ``corpora/brown``. Directory names will automatically + be converted to a platform-appropriate path separator. 
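In day-to-day use, the search path set up above means additional data locations can be registered at runtime by appending to nltk.data.path (the directory here is hypothetical):

    import nltk.data

    nltk.data.path.append("/opt/nltk_data")  # consulted after the defaults above
    print(nltk.data.path[-1])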
+ Directory trailing slashes are preserved + + >>> windows = sys.platform.startswith('win') + >>> normalize_resource_name('.', True) + './' + >>> normalize_resource_name('./', True) + './' + >>> windows or normalize_resource_name('dir/file', False, '/') == '/dir/file' + True + >>> not windows or normalize_resource_name('C:/file', False, '/') == '/C:/file' + True + >>> windows or normalize_resource_name('/dir/file', False, '/') == '/dir/file' + True + >>> windows or normalize_resource_name('../dir/file', False, '/') == '/dir/file' + True + >>> not windows or normalize_resource_name('/dir/file', True, '/') == 'dir/file' + True + >>> windows or normalize_resource_name('/dir/file', True, '/') == '/dir/file' + True + """ + is_dir = bool(re.search(r"[\\/.]$", resource_name)) or resource_name.endswith( + os.path.sep + ) + if sys.platform.startswith("win"): + resource_name = resource_name.lstrip("/") + else: + resource_name = re.sub(r"^/+", "/", resource_name) + if allow_relative: + resource_name = os.path.normpath(resource_name) + else: + if relative_path is None: + relative_path = os.curdir + resource_name = os.path.abspath(os.path.join(relative_path, resource_name)) + resource_name = resource_name.replace("\\", "/").replace(os.path.sep, "/") + if sys.platform.startswith("win") and os.path.isabs(resource_name): + resource_name = "/" + resource_name + if is_dir and not resource_name.endswith("/"): + resource_name += "/" + return resource_name + + +###################################################################### +# Path Pointers +###################################################################### + + +class PathPointer(metaclass=ABCMeta): + """ + An abstract base class for 'path pointers,' used by NLTK's data + package to identify specific paths. Two subclasses exist: + ``FileSystemPathPointer`` identifies a file that can be accessed + directly via a given absolute path. ``ZipFilePathPointer`` + identifies a file contained within a zipfile, that can be accessed + by reading that zipfile. + """ + + @abstractmethod + def open(self, encoding=None): + """ + Return a seekable read-only stream that can be used to read + the contents of the file identified by this path pointer. + + :raise IOError: If the path specified by this pointer does + not contain a readable file. + """ + + @abstractmethod + def file_size(self): + """ + Return the size of the file pointed to by this path pointer, + in bytes. + + :raise IOError: If the path specified by this pointer does + not contain a readable file. + """ + + @abstractmethod + def join(self, fileid): + """ + Return a new path pointer formed by starting at the path + identified by this pointer, and then following the relative + path given by ``fileid``. The path components of ``fileid`` + should be separated by forward slashes, regardless of + the underlying file system's path separator character. + """ + + +class FileSystemPathPointer(PathPointer, str): + """ + A path pointer that identifies a file which can be accessed + directly via a given absolute path. + """ + + @py3_data + def __init__(self, _path): + """ + Create a new path pointer for the given absolute path. + + :raise IOError: If the given path does not exist. + """ + + _path = os.path.abspath(_path) + if not os.path.exists(_path): + raise OSError("No such file or directory: %r" % _path) + self._path = _path + + # There's no need to call str.__init__(), since it's a no-op; + # str does all of its setup work in __new__. 
+ + @property + def path(self): + """The absolute path identified by this path pointer.""" + return self._path + + def open(self, encoding=None): + stream = open(self._path, "rb") + if encoding is not None: + stream = SeekableUnicodeStreamReader(stream, encoding) + return stream + + def file_size(self): + return os.stat(self._path).st_size + + def join(self, fileid): + _path = os.path.join(self._path, fileid) + return FileSystemPathPointer(_path) + + def __repr__(self): + return "FileSystemPathPointer(%r)" % self._path + + def __str__(self): + return self._path + + +@deprecated("Use gzip.GzipFile instead as it also uses a buffer.") +class BufferedGzipFile(GzipFile): + """A ``GzipFile`` subclass for compatibility with older nltk releases. + + Use ``GzipFile`` directly as it also buffers in all supported + Python versions. + """ + + @py3_data + def __init__( + self, filename=None, mode=None, compresslevel=9, fileobj=None, **kwargs + ): + """Return a buffered gzip file object.""" + GzipFile.__init__(self, filename, mode, compresslevel, fileobj) + + def write(self, data): + # This is identical to GzipFile.write but does not return + # the bytes written to retain compatibility. + super().write(data) + + +class GzipFileSystemPathPointer(FileSystemPathPointer): + """ + A subclass of ``FileSystemPathPointer`` that identifies a gzip-compressed + file located at a given absolute path. ``GzipFileSystemPathPointer`` is + appropriate for loading large gzip-compressed pickle objects efficiently. + """ + + def open(self, encoding=None): + stream = GzipFile(self._path, "rb") + if encoding: + stream = SeekableUnicodeStreamReader(stream, encoding) + return stream + + +class ZipFilePathPointer(PathPointer): + """ + A path pointer that identifies a file contained within a zipfile, + which can be accessed by reading that zipfile. + """ + + @py3_data + def __init__(self, zipfile, entry=""): + """ + Create a new path pointer pointing at the specified entry + in the given zipfile. + + :raise IOError: If the given zipfile does not exist, or if it + does not contain the specified entry. + """ + if isinstance(zipfile, str): + zipfile = OpenOnDemandZipFile(os.path.abspath(zipfile)) + + # Check that the entry exists: + if entry: + + # Normalize the entry string, it should be relative: + entry = normalize_resource_name(entry, True, "/").lstrip("/") + + try: + zipfile.getinfo(entry) + except Exception as e: + # Sometimes directories aren't explicitly listed in + # the zip file. So if `entry` is a directory name, + # then check if the zipfile contains any files that + # are under the given directory. + if entry.endswith("/") and [ + n for n in zipfile.namelist() if n.startswith(entry) + ]: + pass # zipfile contains a file in that directory. + else: + # Otherwise, complain. + raise OSError( + f"Zipfile {zipfile.filename!r} does not contain {entry!r}" + ) from e + self._zipfile = zipfile + self._entry = entry + + @property + def zipfile(self): + """ + The zipfile.ZipFile object used to access the zip file + containing the entry identified by this path pointer. + """ + return self._zipfile + + @property + def entry(self): + """ + The name of the file within zipfile that this path + pointer points to. 
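A FileSystemPathPointer behaves like a plain string but adds the open(), file_size() and join() accessors shown above; a self-contained sketch against a temporary file:

    import os
    import tempfile

    from nltk.data import FileSystemPathPointer

    with tempfile.NamedTemporaryFile(delete=False) as f:
        f.write(b"hello")
    ptr = FileSystemPathPointer(f.name)  # raises OSError for a missing path
    print(ptr.file_size())               # 5
    with ptr.open() as stream:
        print(stream.read())             # b'hello'
    os.unlink(f.name)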
+ """ + return self._entry + + def open(self, encoding=None): + data = self._zipfile.read(self._entry) + stream = BytesIO(data) + if self._entry.endswith(".gz"): + stream = GzipFile(self._entry, fileobj=stream) + elif encoding is not None: + stream = SeekableUnicodeStreamReader(stream, encoding) + return stream + + def file_size(self): + return self._zipfile.getinfo(self._entry).file_size + + def join(self, fileid): + entry = f"{self._entry}/{fileid}" + return ZipFilePathPointer(self._zipfile, entry) + + def __repr__(self): + return f"ZipFilePathPointer({self._zipfile.filename!r}, {self._entry!r})" + + def __str__(self): + return os.path.normpath(os.path.join(self._zipfile.filename, self._entry)) + + +###################################################################### +# Access Functions +###################################################################### + +# Don't use a weak dictionary, because in the common case this +# causes a lot more reloading that necessary. +_resource_cache = {} +"""A dictionary used to cache resources so that they won't + need to be loaded more than once.""" + + +def find(resource_name, paths=None): + """ + Find the given resource by searching through the directories and + zip files in paths, where a None or empty string specifies an absolute path. + Returns a corresponding path name. If the given resource is not + found, raise a ``LookupError``, whose message gives a pointer to + the installation instructions for the NLTK downloader. + + Zip File Handling: + + - If ``resource_name`` contains a component with a ``.zip`` + extension, then it is assumed to be a zipfile; and the + remaining path components are used to look inside the zipfile. + + - If any element of ``nltk.data.path`` has a ``.zip`` extension, + then it is assumed to be a zipfile. + + - If a given resource name that does not contain any zipfile + component is not found initially, then ``find()`` will make a + second attempt to find that resource, by replacing each + component *p* in the path with *p.zip/p*. For example, this + allows ``find()`` to map the resource name + ``corpora/chat80/cities.pl`` to a zip file path pointer to + ``corpora/chat80.zip/chat80/cities.pl``. + + - When using ``find()`` to locate a directory contained in a + zipfile, the resource name must end with the forward slash + character. Otherwise, ``find()`` will not locate the + directory. + + :type resource_name: str or unicode + :param resource_name: The name of the resource to search for. + Resource names are posix-style relative path names, such as + ``corpora/brown``. Directory names will be + automatically converted to a platform-appropriate path separator. + :rtype: str + """ + resource_name = normalize_resource_name(resource_name, True) + + # Resolve default paths at runtime in-case the user overrides + # nltk.data.path + if paths is None: + paths = path + + # Check if the resource name includes a zipfile name + m = re.match(r"(.*\.zip)/?(.*)$|", resource_name) + zipfile, zipentry = m.groups() + + # Check each item in our path + for path_ in paths: + # Is the path item a zipfile? + if path_ and (os.path.isfile(path_) and path_.endswith(".zip")): + try: + return ZipFilePathPointer(path_, resource_name) + except OSError: + # resource not in zipfile + continue + + # Is the path item a directory or is resource_name an absolute path? 
+ elif not path_ or os.path.isdir(path_): + if zipfile is None: + p = os.path.join(path_, url2pathname(resource_name)) + if os.path.exists(p): + if p.endswith(".gz"): + return GzipFileSystemPathPointer(p) + else: + return FileSystemPathPointer(p) + else: + p = os.path.join(path_, url2pathname(zipfile)) + if os.path.exists(p): + try: + return ZipFilePathPointer(p, zipentry) + except OSError: + # resource not in zipfile + continue + + # Fallback: if the path doesn't include a zip file, then try + # again, assuming that one of the path components is inside a + # zipfile of the same name. + if zipfile is None: + pieces = resource_name.split("/") + for i in range(len(pieces)): + modified_name = "/".join(pieces[:i] + [pieces[i] + ".zip"] + pieces[i:]) + try: + return find(modified_name, paths) + except LookupError: + pass + + # Identify the package (i.e. the .zip file) to download. + resource_zipname = resource_name.split("/")[1] + if resource_zipname.endswith(".zip"): + resource_zipname = resource_zipname.rpartition(".")[0] + # Display a friendly error message if the resource wasn't found: + msg = str( + "Resource \33[93m{resource}\033[0m not found.\n" + "Please use the NLTK Downloader to obtain the resource:\n\n" + "\33[31m" # To display red text in terminal. + ">>> import nltk\n" + ">>> nltk.download('{resource}')\n" + "\033[0m" + ).format(resource=resource_zipname) + msg = textwrap_indent(msg) + + msg += "\n For more information see: https://www.nltk.org/data.html\n" + + msg += "\n Attempted to load \33[93m{resource_name}\033[0m\n".format( + resource_name=resource_name + ) + + msg += "\n Searched in:" + "".join("\n - %r" % d for d in paths) + sep = "*" * 70 + resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" + raise LookupError(resource_not_found) + + +def retrieve(resource_url, filename=None, verbose=True): + """ + Copy the given resource to a local file. If no filename is + specified, then use the URL's filename. If there is already a + file named ``filename``, then raise a ``ValueError``. + + :type resource_url: str + :param resource_url: A URL specifying where the resource should be + loaded from. The default protocol is "nltk:", which searches + for the file in the the NLTK data package. + """ + resource_url = normalize_resource_url(resource_url) + if filename is None: + if resource_url.startswith("file:"): + filename = os.path.split(resource_url)[-1] + else: + filename = re.sub(r"(^\w+:)?.*/", "", resource_url) + if os.path.exists(filename): + filename = os.path.abspath(filename) + raise ValueError("File %r already exists!" % filename) + + if verbose: + print(f"Retrieving {resource_url!r}, saving to {filename!r}") + + # Open the input & output streams. + infile = _open(resource_url) + + # Copy infile -> outfile, using 64k blocks. + with open(filename, "wb") as outfile: + while True: + s = infile.read(1024 * 64) # 64k blocks. + outfile.write(s) + if not s: + break + + infile.close() + + +#: A dictionary describing the formats that are supported by NLTK's +#: load() method. Keys are format names, and values are format +#: descriptions. 
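With the lookup rules above, find() transparently resolves resource names into installed zip files, per the corpora/chat80/cities.pl example from its docstring (assumes the chat80 data package is installed):

    import nltk.data

    ptr = nltk.data.find("corpora/chat80/cities.pl")
    print(ptr)  # typically a ZipFilePathPointer into corpora/chat80.zip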
+FORMATS = {
+    "pickle": "A serialized python object, stored using the pickle module.",
+    "json": "A serialized python object, stored using the json module.",
+    "yaml": "A serialized python object, stored using the yaml module.",
+    "cfg": "A context free grammar.",
+    "pcfg": "A probabilistic CFG.",
+    "fcfg": "A feature CFG.",
+    "fol": "A list of first order logic expressions, parsed with "
+    "nltk.sem.logic.Expression.fromstring.",
+    "logic": "A list of first order logic expressions, parsed with "
+    "nltk.sem.logic.LogicParser. Requires an additional logic_parser "
+    "parameter.",
+    "val": "A semantic valuation, parsed by nltk.sem.Valuation.fromstring.",
+    "raw": "The raw (byte string) contents of a file.",
+    "text": "The raw (unicode string) contents of a file.",
+}
+
+#: A dictionary mapping from file extensions to format names, used
+#: by load() when format="auto" to decide the format for a
+#: given resource url.
+AUTO_FORMATS = {
+    "pickle": "pickle",
+    "json": "json",
+    "yaml": "yaml",
+    "cfg": "cfg",
+    "pcfg": "pcfg",
+    "fcfg": "fcfg",
+    "fol": "fol",
+    "logic": "logic",
+    "val": "val",
+    "txt": "text",
+    "text": "text",
+}
+
+
+def load(
+    resource_url,
+    format="auto",
+    cache=True,
+    verbose=False,
+    logic_parser=None,
+    fstruct_reader=None,
+    encoding=None,
+):
+    """
+    Load a given resource from the NLTK data package. The following
+    resource formats are currently supported:
+
+    - ``pickle``
+    - ``json``
+    - ``yaml``
+    - ``cfg`` (context free grammars)
+    - ``pcfg`` (probabilistic CFGs)
+    - ``fcfg`` (feature-based CFGs)
+    - ``fol`` (formulas of First Order Logic)
+    - ``logic`` (Logical formulas to be parsed by the given logic_parser)
+    - ``val`` (valuation of First Order Logic model)
+    - ``text`` (the file contents as a unicode string)
+    - ``raw`` (the raw file contents as a byte string)
+
+    If no format is specified, ``load()`` will attempt to determine a
+    format based on the resource name's file extension. If that
+    fails, ``load()`` will raise a ``ValueError`` exception.
+
+    For all text formats (everything except ``pickle``, ``json``, ``yaml`` and ``raw``),
+    it tries to decode the raw contents using UTF-8, and if that doesn't
+    work, it tries with ISO-8859-1 (Latin-1), unless the ``encoding``
+    is specified.
+
+    :type resource_url: str
+    :param resource_url: A URL specifying where the resource should be
+        loaded from. The default protocol is "nltk:", which searches
+        for the file in the NLTK data package.
+    :type cache: bool
+    :param cache: If true, add this resource to a cache. If load()
+        finds a resource in its cache, then it will return it from the
+        cache rather than loading it.
+    :type verbose: bool
+    :param verbose: If true, print a message when loading a resource.
+        Messages are not displayed when a resource is retrieved from
+        the cache.
+    :type logic_parser: LogicParser
+    :param logic_parser: The parser that will be used to parse logical
+        expressions.
+    :type fstruct_reader: FeatStructReader
+    :param fstruct_reader: The parser that will be used to parse the
+        feature structure of an fcfg.
+    :type encoding: str
+    :param encoding: the encoding of the input; only used for text formats.
+    """
+    resource_url = normalize_resource_url(resource_url)
+    resource_url = add_py3_data(resource_url)
+
+    # Determine the format of the resource.
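+    # (For example, a resource_url ending in ".cfg" resolves to the "cfg"
+    # format, and a trailing ".gz" is skipped first, so a hypothetical
+    # "foo.pickle.gz" still resolves to "pickle".)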
+ if format == "auto": + resource_url_parts = resource_url.split(".") + ext = resource_url_parts[-1] + if ext == "gz": + ext = resource_url_parts[-2] + format = AUTO_FORMATS.get(ext) + if format is None: + raise ValueError( + "Could not determine format for %s based " + 'on its file\nextension; use the "format" ' + "argument to specify the format explicitly." % resource_url + ) + + if format not in FORMATS: + raise ValueError(f"Unknown format type: {format}!") + + # If we've cached the resource, then just return it. + if cache: + resource_val = _resource_cache.get((resource_url, format)) + if resource_val is not None: + if verbose: + print(f"<>") + return resource_val + + # Let the user know what's going on. + if verbose: + print(f"<>") + + # Load the resource. + opened_resource = _open(resource_url) + + if format == "raw": + resource_val = opened_resource.read() + elif format == "pickle": + resource_val = pickle.load(opened_resource) + elif format == "json": + import json + + from nltk.jsontags import json_tags + + resource_val = json.load(opened_resource) + tag = None + if len(resource_val) != 1: + tag = next(resource_val.keys()) + if tag not in json_tags: + raise ValueError("Unknown json tag.") + elif format == "yaml": + import yaml + + resource_val = yaml.safe_load(opened_resource) + else: + # The resource is a text format. + binary_data = opened_resource.read() + if encoding is not None: + string_data = binary_data.decode(encoding) + else: + try: + string_data = binary_data.decode("utf-8") + except UnicodeDecodeError: + string_data = binary_data.decode("latin-1") + if format == "text": + resource_val = string_data + elif format == "cfg": + resource_val = grammar.CFG.fromstring(string_data, encoding=encoding) + elif format == "pcfg": + resource_val = grammar.PCFG.fromstring(string_data, encoding=encoding) + elif format == "fcfg": + resource_val = grammar.FeatureGrammar.fromstring( + string_data, + logic_parser=logic_parser, + fstruct_reader=fstruct_reader, + encoding=encoding, + ) + elif format == "fol": + resource_val = sem.read_logic( + string_data, + logic_parser=sem.logic.LogicParser(), + encoding=encoding, + ) + elif format == "logic": + resource_val = sem.read_logic( + string_data, logic_parser=logic_parser, encoding=encoding + ) + elif format == "val": + resource_val = sem.read_valuation(string_data, encoding=encoding) + else: + raise AssertionError( + "Internal NLTK error: Format %s isn't " + "handled by nltk.data.load()" % (format,) + ) + + opened_resource.close() + + # If requested, add it to the cache. + if cache: + try: + _resource_cache[(resource_url, format)] = resource_val + # TODO: add this line + # print('<>' % (resource_url,)) + except TypeError: + # We can't create weak references to some object types, like + # strings and tuples. For now, just don't cache them. + pass + + return resource_val + + +def show_cfg(resource_url, escape="##"): + """ + Write out a grammar file, ignoring escaped and empty lines. + + :type resource_url: str + :param resource_url: A URL specifying where the resource should be + loaded from. The default protocol is "nltk:", which searches + for the file in the the NLTK data package. 
+    :type escape: str
+    :param escape: Prepended string that signals lines to be ignored
+    """
+    resource_url = normalize_resource_url(resource_url)
+    resource_val = load(resource_url, format="text", cache=False)
+    lines = resource_val.splitlines()
+    for line in lines:
+        if line.startswith(escape):
+            continue
+        if re.match("^$", line):
+            continue
+        print(line)
+
+
+def clear_cache():
+    """
+    Remove all objects from the resource cache.
+    :see: load()
+    """
+    _resource_cache.clear()
+
+
+def _open(resource_url):
+    """
+    Helper function that returns an open file object for a resource,
+    given its resource URL. If the given resource URL uses the "nltk:"
+    protocol, or uses no protocol, then use ``nltk.data.find`` to find
+    its path, and open it; if the resource URL uses the 'file'
+    protocol, then open the file directly; otherwise, delegate to
+    ``urllib.request.urlopen``.
+
+    :type resource_url: str
+    :param resource_url: A URL specifying where the resource should be
+        loaded from. The default protocol is "nltk:", which searches
+        for the file in the NLTK data package.
+    """
+    resource_url = normalize_resource_url(resource_url)
+    protocol, path_ = split_resource_url(resource_url)
+
+    if protocol is None or protocol.lower() == "nltk":
+        return find(path_, path + [""]).open()
+    elif protocol.lower() == "file":
+        # urllib might not use mode='rb', so handle this one ourselves:
+        return find(path_, [""]).open()
+    else:
+        return urlopen(resource_url)
+
+
+######################################################################
+# Lazy Resource Loader
+######################################################################
+
+
+class LazyLoader:
+    @py3_data
+    def __init__(self, _path):
+        self._path = _path
+
+    def __load(self):
+        resource = load(self._path)
+        # This is where the magic happens! Transform ourselves into
+        # the object by modifying our own __dict__ and __class__ to
+        # match that of `resource`.
+        self.__dict__ = resource.__dict__
+        self.__class__ = resource.__class__
+
+    def __getattr__(self, attr):
+        self.__load()
+        # This looks circular, but it's not, since __load() changes our
+        # __class__ to something new:
+        return getattr(self, attr)
+
+    def __repr__(self):
+        self.__load()
+        # This looks circular, but it's not, since __load() changes our
+        # __class__ to something new:
+        return repr(self)
+
+
+######################################################################
+# Open-On-Demand ZipFile
+######################################################################
+
+
+class OpenOnDemandZipFile(zipfile.ZipFile):
+    """
+    A subclass of ``zipfile.ZipFile`` that closes its file pointer
+    whenever it is not using it; and re-opens it when it needs to read
+    data from the zipfile. This is useful for reducing the number of
+    open file handles when many zip files are being accessed at once.
+    ``OpenOnDemandZipFile`` must be constructed from a filename, not a
+    file-like object (to allow re-opening). ``OpenOnDemandZipFile`` is
+    read-only (i.e. ``write()`` and ``writestr()`` are disabled).
+    """
+
+    @py3_data
+    def __init__(self, filename):
+        if not isinstance(filename, str):
+            raise TypeError("ReopenableZipFile filename must be a string")
+        zipfile.ZipFile.__init__(self, filename)
+        assert self.filename == filename
+        self.close()
+        # After closing a ZipFile object, the _fileRefCnt needs to be
+        # cleared, so the underlying file handle is really released.
+        self._fileRefCnt = 0
+
+    def read(self, name):
+        assert self.fp is None
+        self.fp = open(self.filename, "rb")
+        value = zipfile.ZipFile.read(self, name)
+        # Bump _fileRefCnt to account for the file we just opened, so
+        # that the close() below really releases it. Since we only
+        # opened one file here, we add 1.
+        self._fileRefCnt += 1
+        self.close()
+        return value
+
+    def write(self, *args, **kwargs):
+        """:raise NotImplementedError: OpenOnDemandZipfile is read-only"""
+        raise NotImplementedError("OpenOnDemandZipfile is read-only")
+
+    def writestr(self, *args, **kwargs):
+        """:raise NotImplementedError: OpenOnDemandZipfile is read-only"""
+        raise NotImplementedError("OpenOnDemandZipfile is read-only")
+
+    def __repr__(self):
+        return "OpenOnDemandZipFile(%r)" % self.filename
+
+
+######################################################################
+# Seekable Unicode Stream Reader
+######################################################################
+
+
+class SeekableUnicodeStreamReader:
+    """
+    A stream reader that automatically encodes the source byte stream
+    into unicode (like ``codecs.StreamReader``); but still supports the
+    ``seek()`` and ``tell()`` operations correctly. This is in contrast
+    to ``codecs.StreamReader``, which provides *broken* ``seek()`` and
+    ``tell()`` methods.
+
+    This class was motivated by ``StreamBackedCorpusView``, which
+    makes extensive use of ``seek()`` and ``tell()``, and needs to be
+    able to handle unicode-encoded files.
+
+    Note: this class requires stateless decoders. To my knowledge,
+    this shouldn't cause a problem with any of Python's built-in
+    unicode encodings.
+    """
+
+    DEBUG = True  #: If true, then perform extra sanity checks.
+
+    @py3_data
+    def __init__(self, stream, encoding, errors="strict"):
+        # Rewind the stream to its beginning.
+        stream.seek(0)
+
+        self.stream = stream
+        """The underlying stream."""
+
+        self.encoding = encoding
+        """The name of the encoding that should be used to decode the
+        underlying stream."""
+
+        self.errors = errors
+        """The error mode that should be used when decoding data from
+        the underlying stream. Can be 'strict', 'ignore', or
+        'replace'."""
+
+        self.decode = codecs.getdecoder(encoding)
+        """The function that is used to decode byte strings into
+        unicode strings."""
+
+        self.bytebuffer = b""
+        """A buffer holding bytes that have been read but have not yet
+        been decoded. This is only used when the final bytes from
+        a read do not form a complete encoding for a character."""
+
+        self.linebuffer = None
+        """A buffer used by ``readline()`` to hold characters that have
+        been read, but have not yet been returned by ``read()`` or
+        ``readline()``. This buffer consists of a list of unicode
+        strings, where each string corresponds to a single line.
+        The final element of the list may or may not be a complete
+        line. Note that the existence of a linebuffer makes the
+        ``tell()`` operation more complex, because it must backtrack
+        to the beginning of the buffer to determine the correct
+        file position in the underlying byte stream."""
+
+        self._rewind_checkpoint = 0
+        """The file position at which the most recent read on the
+        underlying stream began. This is used, together with
+        ``_rewind_numchars``, to backtrack to the beginning of
+        ``linebuffer`` (which is required by ``tell()``)."""
+
+        self._rewind_numchars = None
+        """The number of characters that have been returned since the
+        read that started at ``_rewind_checkpoint``.
This is used, + together with ``_rewind_checkpoint``, to backtrack to the + beginning of ``linebuffer`` (which is required by ``tell()``).""" + + self._bom = self._check_bom() + """The length of the byte order marker at the beginning of + the stream (or None for no byte order marker).""" + + # ///////////////////////////////////////////////////////////////// + # Read methods + # ///////////////////////////////////////////////////////////////// + + def read(self, size=None): + """ + Read up to ``size`` bytes, decode them using this reader's + encoding, and return the resulting unicode string. + + :param size: The maximum number of bytes to read. If not + specified, then read as many bytes as possible. + :type size: int + :rtype: unicode + """ + chars = self._read(size) + + # If linebuffer is not empty, then include it in the result + if self.linebuffer: + chars = "".join(self.linebuffer) + chars + self.linebuffer = None + self._rewind_numchars = None + + return chars + + def discard_line(self): + if self.linebuffer and len(self.linebuffer) > 1: + line = self.linebuffer.pop(0) + self._rewind_numchars += len(line) + else: + self.stream.readline() + + def readline(self, size=None): + """ + Read a line of text, decode it using this reader's encoding, + and return the resulting unicode string. + + :param size: The maximum number of bytes to read. If no + newline is encountered before ``size`` bytes have been read, + then the returned value may not be a complete line of text. + :type size: int + """ + # If we have a non-empty linebuffer, then return the first + # line from it. (Note that the last element of linebuffer may + # not be a complete line; so let _read() deal with it.) + if self.linebuffer and len(self.linebuffer) > 1: + line = self.linebuffer.pop(0) + self._rewind_numchars += len(line) + return line + + readsize = size or 72 + chars = "" + + # If there's a remaining incomplete line in the buffer, add it. + if self.linebuffer: + chars += self.linebuffer.pop() + self.linebuffer = None + + while True: + startpos = self.stream.tell() - len(self.bytebuffer) + new_chars = self._read(readsize) + + # If we're at a '\r', then read one extra character, since + # it might be a '\n', to get the proper line ending. + if new_chars and new_chars.endswith("\r"): + new_chars += self._read(1) + + chars += new_chars + lines = chars.splitlines(True) + if len(lines) > 1: + line = lines[0] + self.linebuffer = lines[1:] + self._rewind_numchars = len(new_chars) - (len(chars) - len(line)) + self._rewind_checkpoint = startpos + break + elif len(lines) == 1: + line0withend = lines[0] + line0withoutend = lines[0].splitlines(False)[0] + if line0withend != line0withoutend: # complete line + line = line0withend + break + + if not new_chars or size is not None: + line = chars + break + + # Read successively larger blocks of text. + if readsize < 8000: + readsize *= 2 + + return line + + def readlines(self, sizehint=None, keepends=True): + """ + Read this file's contents, decode them using this reader's + encoding, and return it as a list of unicode lines. + + :rtype: list(unicode) + :param sizehint: Ignored. + :param keepends: If false, then strip newlines. 
+ """ + return self.read().splitlines(keepends) + + def next(self): + """Return the next decoded line from the underlying stream.""" + line = self.readline() + if line: + return line + else: + raise StopIteration + + def __next__(self): + return self.next() + + def __iter__(self): + """Return self""" + return self + + def __del__(self): + # let garbage collector deal with still opened streams + if not self.closed: + self.close() + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + self.close() + + def xreadlines(self): + """Return self""" + return self + + # ///////////////////////////////////////////////////////////////// + # Pass-through methods & properties + # ///////////////////////////////////////////////////////////////// + + @property + def closed(self): + """True if the underlying stream is closed.""" + return self.stream.closed + + @property + def name(self): + """The name of the underlying stream.""" + return self.stream.name + + @property + def mode(self): + """The mode of the underlying stream.""" + return self.stream.mode + + def close(self): + """ + Close the underlying stream. + """ + self.stream.close() + + # ///////////////////////////////////////////////////////////////// + # Seek and tell + # ///////////////////////////////////////////////////////////////// + + def seek(self, offset, whence=0): + """ + Move the stream to a new file position. If the reader is + maintaining any buffers, then they will be cleared. + + :param offset: A byte count offset. + :param whence: If 0, then the offset is from the start of the file + (offset should be positive), if 1, then the offset is from the + current position (offset may be positive or negative); and if 2, + then the offset is from the end of the file (offset should + typically be negative). + """ + if whence == 1: + raise ValueError( + "Relative seek is not supported for " + "SeekableUnicodeStreamReader -- consider " + "using char_seek_forward() instead." + ) + self.stream.seek(offset, whence) + self.linebuffer = None + self.bytebuffer = b"" + self._rewind_numchars = None + self._rewind_checkpoint = self.stream.tell() + + def char_seek_forward(self, offset): + """ + Move the read pointer forward by ``offset`` characters. + """ + if offset < 0: + raise ValueError("Negative offsets are not supported") + # Clear all buffers. + self.seek(self.tell()) + # Perform the seek operation. + self._char_seek_forward(offset) + + def _char_seek_forward(self, offset, est_bytes=None): + """ + Move the file position forward by ``offset`` characters, + ignoring all buffers. + + :param est_bytes: A hint, giving an estimate of the number of + bytes that will be needed to move forward by ``offset`` chars. + Defaults to ``offset``. + """ + if est_bytes is None: + est_bytes = offset + bytes = b"" + + while True: + # Read in a block of bytes. + newbytes = self.stream.read(est_bytes - len(bytes)) + bytes += newbytes + + # Decode the bytes to characters. + chars, bytes_decoded = self._incr_decode(bytes) + + # If we got the right number of characters, then seek + # backwards over any truncated characters, and return. + if len(chars) == offset: + self.stream.seek(-len(bytes) + bytes_decoded, 1) + return + + # If we went too far, then we can back-up until we get it + # right, using the bytes we've already read. + if len(chars) > offset: + while len(chars) > offset: + # Assume at least one byte/char. 
+                    est_bytes += offset - len(chars)
+                    chars, bytes_decoded = self._incr_decode(bytes[:est_bytes])
+                self.stream.seek(-len(bytes) + bytes_decoded, 1)
+                return
+
+            # Otherwise, we haven't read enough bytes yet; loop again.
+            est_bytes += offset - len(chars)
+
+    def tell(self):
+        """
+        Return the current file position on the underlying byte
+        stream. If this reader is maintaining any buffers, then the
+        returned file position will be the position of the beginning
+        of those buffers.
+        """
+        # If nothing's buffered, then just return our current filepos:
+        if self.linebuffer is None:
+            return self.stream.tell() - len(self.bytebuffer)
+
+        # Otherwise, we'll need to backtrack the filepos until we
+        # reach the beginning of the buffer.
+
+        # Store our original file position, so we can return here.
+        orig_filepos = self.stream.tell()
+
+        # Calculate an estimate of where we think the newline is.
+        bytes_read = (orig_filepos - len(self.bytebuffer)) - self._rewind_checkpoint
+        buf_size = sum(len(line) for line in self.linebuffer)
+        est_bytes = int(
+            bytes_read * self._rewind_numchars / (self._rewind_numchars + buf_size)
+        )
+
+        self.stream.seek(self._rewind_checkpoint)
+        self._char_seek_forward(self._rewind_numchars, est_bytes)
+        filepos = self.stream.tell()
+
+        # Sanity check
+        if self.DEBUG:
+            self.stream.seek(filepos)
+            check1 = self._incr_decode(self.stream.read(50))[0]
+            check2 = "".join(self.linebuffer)
+            assert check1.startswith(check2) or check2.startswith(check1)
+
+        # Return to our original filepos (so we don't have to throw
+        # out our buffer.)
+        self.stream.seek(orig_filepos)
+
+        # Return the calculated filepos
+        return filepos
+
+    # /////////////////////////////////////////////////////////////////
+    # Helper methods
+    # /////////////////////////////////////////////////////////////////
+
+    def _read(self, size=None):
+        """
+        Read up to ``size`` bytes from the underlying stream, decode
+        them using this reader's encoding, and return the resulting
+        unicode string. ``linebuffer`` is not included in the result.
+        """
+        if size == 0:
+            return ""
+
+        # Skip past the byte order marker, if present.
+        if self._bom and self.stream.tell() == 0:
+            self.stream.read(self._bom)
+
+        # Read the requested number of bytes.
+        if size is None:
+            new_bytes = self.stream.read()
+        else:
+            new_bytes = self.stream.read(size)
+        bytes = self.bytebuffer + new_bytes
+
+        # Decode the bytes into unicode characters
+        chars, bytes_decoded = self._incr_decode(bytes)
+
+        # If we got bytes but couldn't decode any, then read further.
+        if (size is not None) and (not chars) and (len(new_bytes) > 0):
+            while not chars:
+                new_bytes = self.stream.read(1)
+                if not new_bytes:
+                    break  # end of file.
+                bytes += new_bytes
+                chars, bytes_decoded = self._incr_decode(bytes)
+
+        # Record any bytes we didn't consume.
+        self.bytebuffer = bytes[bytes_decoded:]
+
+        # Return the result
+        return chars
+
+    def _incr_decode(self, bytes):
+        """
+        Decode the given byte string into a unicode string, using this
+        reader's encoding. If an exception is encountered that
+        appears to be caused by a truncation error, then just decode
+        the byte string without the bytes that cause the truncation
+        error.
+
+        Return a tuple ``(chars, num_consumed)``, where ``chars`` is
+        the decoded unicode string, and ``num_consumed`` is the
+        number of bytes that were consumed.
+ """ + while True: + try: + return self.decode(bytes, "strict") + except UnicodeDecodeError as exc: + # If the exception occurs at the end of the string, + # then assume that it's a truncation error. + if exc.end == len(bytes): + return self.decode(bytes[: exc.start], self.errors) + + # Otherwise, if we're being strict, then raise it. + elif self.errors == "strict": + raise + + # If we're not strict, then re-process it with our + # errors setting. This *may* raise an exception. + else: + return self.decode(bytes, self.errors) + + _BOM_TABLE = { + "utf8": [(codecs.BOM_UTF8, None)], + "utf16": [(codecs.BOM_UTF16_LE, "utf16-le"), (codecs.BOM_UTF16_BE, "utf16-be")], + "utf16le": [(codecs.BOM_UTF16_LE, None)], + "utf16be": [(codecs.BOM_UTF16_BE, None)], + "utf32": [(codecs.BOM_UTF32_LE, "utf32-le"), (codecs.BOM_UTF32_BE, "utf32-be")], + "utf32le": [(codecs.BOM_UTF32_LE, None)], + "utf32be": [(codecs.BOM_UTF32_BE, None)], + } + + def _check_bom(self): + # Normalize our encoding name + enc = re.sub("[ -]", "", self.encoding.lower()) + + # Look up our encoding in the BOM table. + bom_info = self._BOM_TABLE.get(enc) + + if bom_info: + # Read a prefix, to check against the BOM(s) + bytes = self.stream.read(16) + self.stream.seek(0) + + # Check for each possible BOM. + for (bom, new_encoding) in bom_info: + if bytes.startswith(bom): + if new_encoding: + self.encoding = new_encoding + return len(bom) + + return None + + +__all__ = [ + "path", + "PathPointer", + "FileSystemPathPointer", + "BufferedGzipFile", + "GzipFileSystemPathPointer", + "GzipFileSystemPathPointer", + "find", + "retrieve", + "FORMATS", + "AUTO_FORMATS", + "load", + "show_cfg", + "clear_cache", + "LazyLoader", + "OpenOnDemandZipFile", + "GzipFileSystemPathPointer", + "SeekableUnicodeStreamReader", +] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/decorators.py b/.eggs/nltk-3.8-py3.10.egg/nltk/decorators.py new file mode 100644 index 0000000000000000000000000000000000000000..3a0fae1852afd47a2290b41ce94843aca36aa05f --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/decorators.py @@ -0,0 +1,251 @@ +""" +Decorator module by Michele Simionato +Copyright Michele Simionato, distributed under the terms of the BSD License (see below). +http://www.phyast.pitt.edu/~micheles/python/documentation.html + +Included in NLTK for its support of a nice memoization decorator. +""" + +__docformat__ = "restructuredtext en" + +## The basic trick is to generate the source code for the decorated function +## with the right signature and to evaluate it. +## Uncomment the statement 'print >> sys.stderr, func_src' in _decorator +## to understand what is going on. + +__all__ = ["decorator", "new_wrapper", "getinfo"] + +import sys + +# Hack to keep NLTK's "tokenize" module from colliding with the "tokenize" in +# the Python standard library. +OLD_SYS_PATH = sys.path[:] +sys.path = [p for p in sys.path if p and "nltk" not in str(p)] +import inspect + +sys.path = OLD_SYS_PATH + + +def __legacysignature(signature): + """ + For retrocompatibility reasons, we don't use a standard Signature. + Instead, we use the string generated by this method. + Basically, from a Signature we create a string and remove the default values. 
+ """ + listsignature = str(signature)[1:-1].split(",") + for counter, param in enumerate(listsignature): + if param.count("=") > 0: + listsignature[counter] = param[0 : param.index("=")].strip() + else: + listsignature[counter] = param.strip() + return ", ".join(listsignature) + + +def getinfo(func): + """ + Returns an info dictionary containing: + - name (the name of the function : str) + - argnames (the names of the arguments : list) + - defaults (the values of the default arguments : tuple) + - signature (the signature : str) + - fullsignature (the full signature : Signature) + - doc (the docstring : str) + - module (the module name : str) + - dict (the function __dict__ : str) + + >>> def f(self, x=1, y=2, *args, **kw): pass + + >>> info = getinfo(f) + + >>> info["name"] + 'f' + >>> info["argnames"] + ['self', 'x', 'y', 'args', 'kw'] + + >>> info["defaults"] + (1, 2) + + >>> info["signature"] + 'self, x, y, *args, **kw' + + >>> info["fullsignature"] + + """ + assert inspect.ismethod(func) or inspect.isfunction(func) + argspec = inspect.getfullargspec(func) + regargs, varargs, varkwargs = argspec[:3] + argnames = list(regargs) + if varargs: + argnames.append(varargs) + if varkwargs: + argnames.append(varkwargs) + fullsignature = inspect.signature(func) + # Convert Signature to str + signature = __legacysignature(fullsignature) + + # pypy compatibility + if hasattr(func, "__closure__"): + _closure = func.__closure__ + _globals = func.__globals__ + else: + _closure = func.func_closure + _globals = func.func_globals + + return dict( + name=func.__name__, + argnames=argnames, + signature=signature, + fullsignature=fullsignature, + defaults=func.__defaults__, + doc=func.__doc__, + module=func.__module__, + dict=func.__dict__, + globals=_globals, + closure=_closure, + ) + + +def update_wrapper(wrapper, model, infodict=None): + "akin to functools.update_wrapper" + infodict = infodict or getinfo(model) + wrapper.__name__ = infodict["name"] + wrapper.__doc__ = infodict["doc"] + wrapper.__module__ = infodict["module"] + wrapper.__dict__.update(infodict["dict"]) + wrapper.__defaults__ = infodict["defaults"] + wrapper.undecorated = model + return wrapper + + +def new_wrapper(wrapper, model): + """ + An improvement over functools.update_wrapper. The wrapper is a generic + callable object. It works by generating a copy of the wrapper with the + right signature and by updating the copy, not the original. + Moreovoer, 'model' can be a dictionary with keys 'name', 'doc', 'module', + 'dict', 'defaults'. + """ + if isinstance(model, dict): + infodict = model + else: # assume model is a function + infodict = getinfo(model) + assert ( + not "_wrapper_" in infodict["argnames"] + ), '"_wrapper_" is a reserved argument name!' + src = "lambda %(signature)s: _wrapper_(%(signature)s)" % infodict + funcopy = eval(src, dict(_wrapper_=wrapper)) + return update_wrapper(funcopy, model, infodict) + + +# helper used in decorator_factory +def __call__(self, func): + return new_wrapper(lambda *a, **k: self.call(func, *a, **k), func) + + +def decorator_factory(cls): + """ + Take a class with a ``.caller`` method and return a callable decorator + object. It works by adding a suitable __call__ method to the class; + it raises a TypeError if the class already has a nontrivial __call__ + method. 
+    """
+    attrs = set(dir(cls))
+    if "__call__" in attrs:
+        raise TypeError(
+            "You cannot decorate a class with a nontrivial __call__ method"
+        )
+    if "call" not in attrs:
+        raise TypeError("You cannot decorate a class without a .call method")
+    cls.__call__ = __call__
+    return cls
+
+
+def decorator(caller):
+    """
+    General purpose decorator factory: takes a caller function as
+    input and returns a decorator with the same attributes.
+    A caller function is any function like this::
+
+        def caller(func, *args, **kw):
+            # do something
+            return func(*args, **kw)
+
+    Here is an example of usage:
+
+        >>> @decorator
+        ... def chatty(f, *args, **kw):
+        ...     print("Calling %r" % f.__name__)
+        ...     return f(*args, **kw)
+
+        >>> chatty.__name__
+        'chatty'
+
+        >>> @chatty
+        ... def f(): pass
+        ...
+        >>> f()
+        Calling 'f'
+
+    decorator can also take as input a class with a .call method; in this
+    case it converts the class into a factory of callable decorator objects.
+    See the documentation for an example.
+    """
+    if inspect.isclass(caller):
+        return decorator_factory(caller)
+
+    def _decorator(func):  # the real meat is here
+        infodict = getinfo(func)
+        argnames = infodict["argnames"]
+        assert not (
+            "_call_" in argnames or "_func_" in argnames
+        ), "You cannot use _call_ or _func_ as argument names!"
+        src = "lambda %(signature)s: _call_(_func_, %(signature)s)" % infodict
+        # import sys; print(src, file=sys.stderr)  # for debugging purposes
+        dec_func = eval(src, dict(_func_=func, _call_=caller))
+        return update_wrapper(dec_func, func, infodict)
+
+    return update_wrapper(_decorator, caller)
+
+
+def getattr_(obj, name, default_thunk):
+    "Similar to .setdefault in dictionaries."
+    try:
+        return getattr(obj, name)
+    except AttributeError:
+        default = default_thunk()
+        setattr(obj, name, default)
+        return default
+
+
+@decorator
+def memoize(func, *args):
+    dic = getattr_(func, "memoize_dic", dict)
+    # memoize_dic is created at the first call
+    if args in dic:
+        return dic[args]
+    result = func(*args)
+    dic[args] = result
+    return result
+
+
+########################## LEGALESE ###############################
+
+## Redistributions of source code must retain the above copyright
+## notice, this list of conditions and the following disclaimer.
+## Redistributions in bytecode form must reproduce the above copyright
+## notice, this list of conditions and the following disclaimer in
+## the documentation and/or other materials provided with the
+## distribution.
+
+## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+## HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+## INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+## BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+## OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+## ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+## TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+## USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+## DAMAGE.
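+
+## ---------------------------------------------------------------------
+## A minimal usage sketch for ``memoize`` (an illustration added for
+## clarity, not part of the original module). The per-argument cache is
+## stored on the undecorated function as its ``memoize_dic`` attribute:
+##
+##     >>> @memoize
+##     ... def fib(n):
+##     ...     return n if n < 2 else fib(n - 1) + fib(n - 2)
+##     >>> fib(30)
+##     832040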
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/downloader.py b/.eggs/nltk-3.8-py3.10.egg/nltk/downloader.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc9677d3d86a4462396fff346ec19b9fbad811e3
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/downloader.py
@@ -0,0 +1,2559 @@
+# Natural Language Toolkit: Corpus & Model Downloader
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+The NLTK corpus and module downloader. This module defines several
+interfaces which can be used to download corpora, models, and other
+data packages that can be used with NLTK.
+
+Downloading Packages
+====================
+If called with no arguments, ``download()`` will display an interactive
+interface which can be used to download and install new packages.
+If Tkinter is available, then a graphical interface will be shown,
+otherwise a simple text interface will be provided.
+
+Individual packages can be downloaded by calling the ``download()``
+function with a single argument, giving the package identifier for the
+package that should be downloaded:
+
+    >>> download('treebank') # doctest: +SKIP
+    [nltk_data] Downloading package 'treebank'...
+    [nltk_data]   Unzipping corpora/treebank.zip.
+
+NLTK also provides a number of "package collections", consisting of
+a group of related packages. To download all packages in a
+collection, simply call ``download()`` with the collection's
+identifier:
+
+    >>> download('all-corpora') # doctest: +SKIP
+    [nltk_data] Downloading package 'abc'...
+    [nltk_data]   Unzipping corpora/abc.zip.
+    [nltk_data] Downloading package 'alpino'...
+    [nltk_data]   Unzipping corpora/alpino.zip.
+      ...
+    [nltk_data] Downloading package 'words'...
+    [nltk_data]   Unzipping corpora/words.zip.
+
+Download Directory
+==================
+By default, packages are installed in either a system-wide directory
+(if Python has sufficient access to write to it), or in the current
+user's home directory. However, the ``download_dir`` argument may be
+used to specify a different installation target, if desired.
+
+See ``Downloader.default_download_dir()`` for a more detailed
+description of how the default download directory is chosen.
+
+NLTK Download Server
+====================
+Before downloading any packages, the corpus and module downloader
+contacts the NLTK download server, to retrieve an index file
+describing the available packages. By default, this index file is
+loaded from ``https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml``.
+If necessary, it is possible to create a new ``Downloader`` object,
+specifying a different URL for the package index file.
+
+Usage::
+
+    python nltk/downloader.py [-d DATADIR] [-q] [-f] [-k] PACKAGE_IDS
+
+or::
+
+    python -m nltk.downloader [-d DATADIR] [-q] [-f] [-k] PACKAGE_IDS
+"""
+# ----------------------------------------------------------------------
+
+"""
+
+  0     1  2    3
+[label][----][label][----]
+[column  ][column      ]
+
+Notes
+=====
+Handling data files... Some questions:
+
+* Should the data files be kept zipped or unzipped? I say zipped.
+
+* Should the data files be kept in svn at all? Advantages: history;
+  automatic version numbers; 'svn up' could be used rather than the
+  downloader to update the corpora. Disadvantages: they're big,
+  which makes working from svn a bit of a pain. And we're planning
+  to potentially make them much bigger. I don't think we want
+  people to have to download 400MB corpora just to use nltk from svn.
+ +* Compromise: keep the data files in trunk/data rather than in + trunk/nltk. That way you can check them out in svn if you want + to; but you don't need to, and you can use the downloader instead. + +* Also: keep models in mind. When we change the code, we'd + potentially like the models to get updated. This could require a + little thought. + +* So.. let's assume we have a trunk/data directory, containing a bunch + of packages. The packages should be kept as zip files, because we + really shouldn't be editing them much (well -- we may edit models + more, but they tend to be binary-ish files anyway, where diffs + aren't that helpful). So we'll have trunk/data, with a bunch of + files like abc.zip and treebank.zip and propbank.zip. For each + package we could also have eg treebank.xml and propbank.xml, + describing the contents of the package (name, copyright, license, + etc). Collections would also have .xml files. Finally, we would + pull all these together to form a single index.xml file. Some + directory structure wouldn't hurt. So how about:: + + /trunk/data/ ....................... root of data svn + index.xml ........................ main index file + src/ ............................. python scripts + packages/ ........................ dir for packages + corpora/ ....................... zip & xml files for corpora + grammars/ ...................... zip & xml files for grammars + taggers/ ....................... zip & xml files for taggers + tokenizers/ .................... zip & xml files for tokenizers + etc. + collections/ ..................... xml files for collections + + Where the root (/trunk/data) would contain a makefile; and src/ + would contain a script to update the info.xml file. It could also + contain scripts to rebuild some of the various model files. The + script that builds index.xml should probably check that each zip + file expands entirely into a single subdir, whose name matches the + package's uid. + +Changes I need to make: + - in index: change "size" to "filesize" or "compressed-size" + - in index: add "unzipped-size" + - when checking status: check both compressed & uncompressed size. + uncompressed size is important to make sure we detect a problem + if something got partially unzipped. define new status values + to differentiate stale vs corrupt vs corruptly-uncompressed?? + (we shouldn't need to re-download the file if the zip file is ok + but it didn't get uncompressed fully.) + - add other fields to the index: author, license, copyright, contact, + etc. + +the current grammars/ package would become a single new package (eg +toy-grammars or book-grammars). + +xml file should have: + - authorship info + - license info + - copyright info + - contact info + - info about what type of data/annotation it contains? + - recommended corpus reader? + +collections can contain other collections. they can also contain +multiple package types (corpora & models). Have a single 'basics' +package that includes everything we talk about in the book? + +n.b.: there will have to be a fallback to the punkt tokenizer, in case +they didn't download that model. + +default: unzip or not? 
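+
+A sketch of the "check both compressed & uncompressed size" idea above
+(an illustration added for clarity, not part of the original notes; the
+real logic lives in ``Downloader._pkg_status`` below, and this assumes
+the module's ``md5_hexdigest`` checksum helper)::
+
+    def looks_installed(pkg, filepath):
+        # A wrong compressed size or checksum means a stale download.
+        if os.stat(filepath).st_size != int(pkg.size):
+            return False
+        if md5_hexdigest(filepath) != pkg.checksum:
+            return False
+        # A wrong total size under the unzip dir means a partial unzip.
+        unzipdir = filepath[:-4]
+        if os.path.isdir(unzipdir):
+            unzipped = sum(
+                os.stat(os.path.join(d, f)).st_size
+                for d, _, files in os.walk(unzipdir)
+                for f in files
+            )
+            if unzipped != int(pkg.unzipped_size):
+                return False
+        return True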
+ +""" +import functools +import itertools +import os +import shutil +import subprocess +import sys +import textwrap +import threading +import time +import warnings +import zipfile +from hashlib import md5 +from xml.etree import ElementTree + +try: + TKINTER = True + from tkinter import Button, Canvas, Entry, Frame, IntVar, Label, Menu, TclError, Tk + from tkinter.messagebox import showerror + + from nltk.draw.table import Table + from nltk.draw.util import ShowText +except ImportError: + TKINTER = False + TclError = ValueError + +from urllib.error import HTTPError, URLError +from urllib.request import urlopen + +import nltk + +# urllib2 = nltk.internals.import_from_stdlib('urllib2') + + +###################################################################### +# Directory entry objects (from the data server's index file) +###################################################################### + + +class Package: + """ + A directory entry for a downloadable package. These entries are + extracted from the XML index file that is downloaded by + ``Downloader``. Each package consists of a single file; but if + that file is a zip file, then it can be automatically decompressed + when the package is installed. + """ + + def __init__( + self, + id, + url, + name=None, + subdir="", + size=None, + unzipped_size=None, + checksum=None, + svn_revision=None, + copyright="Unknown", + contact="Unknown", + license="Unknown", + author="Unknown", + unzip=True, + **kw, + ): + self.id = id + """A unique identifier for this package.""" + + self.name = name or id + """A string name for this package.""" + + self.subdir = subdir + """The subdirectory where this package should be installed. + E.g., ``'corpora'`` or ``'taggers'``.""" + + self.url = url + """A URL that can be used to download this package's file.""" + + self.size = int(size) + """The filesize (in bytes) of the package file.""" + + self.unzipped_size = int(unzipped_size) + """The total filesize of the files contained in the package's + zipfile.""" + + self.checksum = checksum + """The MD-5 checksum of the package file.""" + + self.svn_revision = svn_revision + """A subversion revision number for this package.""" + + self.copyright = copyright + """Copyright holder for this package.""" + + self.contact = contact + """Name & email of the person who should be contacted with + questions about this package.""" + + self.license = license + """License information for this package.""" + + self.author = author + """Author of this package.""" + + ext = os.path.splitext(url.split("/")[-1])[1] + self.filename = os.path.join(subdir, id + ext) + """The filename that should be used for this package's file. It + is formed by joining ``self.subdir`` with ``self.id``, and + using the same extension as ``url``.""" + + self.unzip = bool(int(unzip)) # '0' or '1' + """A flag indicating whether this corpus should be unzipped by + default.""" + + # Include any other attributes provided by the XML file. + self.__dict__.update(kw) + + @staticmethod + def fromxml(xml): + if isinstance(xml, str): + xml = ElementTree.parse(xml) + for key in xml.attrib: + xml.attrib[key] = str(xml.attrib[key]) + return Package(**xml.attrib) + + def __lt__(self, other): + return self.id < other.id + + def __repr__(self): + return "" % self.id + + +class Collection: + """ + A directory entry for a collection of downloadable packages. + These entries are extracted from the XML index file that is + downloaded by ``Downloader``. 
+ """ + + def __init__(self, id, children, name=None, **kw): + self.id = id + """A unique identifier for this collection.""" + + self.name = name or id + """A string name for this collection.""" + + self.children = children + """A list of the ``Collections`` or ``Packages`` directly + contained by this collection.""" + + self.packages = None + """A list of ``Packages`` contained by this collection or any + collections it recursively contains.""" + + # Include any other attributes provided by the XML file. + self.__dict__.update(kw) + + @staticmethod + def fromxml(xml): + if isinstance(xml, str): + xml = ElementTree.parse(xml) + for key in xml.attrib: + xml.attrib[key] = str(xml.attrib[key]) + children = [child.get("ref") for child in xml.findall("item")] + return Collection(children=children, **xml.attrib) + + def __lt__(self, other): + return self.id < other.id + + def __repr__(self): + return "" % self.id + + +###################################################################### +# Message Passing Objects +###################################################################### + + +class DownloaderMessage: + """A status message object, used by ``incr_download`` to + communicate its progress.""" + + +class StartCollectionMessage(DownloaderMessage): + """Data server has started working on a collection of packages.""" + + def __init__(self, collection): + self.collection = collection + + +class FinishCollectionMessage(DownloaderMessage): + """Data server has finished working on a collection of packages.""" + + def __init__(self, collection): + self.collection = collection + + +class StartPackageMessage(DownloaderMessage): + """Data server has started working on a package.""" + + def __init__(self, package): + self.package = package + + +class FinishPackageMessage(DownloaderMessage): + """Data server has finished working on a package.""" + + def __init__(self, package): + self.package = package + + +class StartDownloadMessage(DownloaderMessage): + """Data server has started downloading a package.""" + + def __init__(self, package): + self.package = package + + +class FinishDownloadMessage(DownloaderMessage): + """Data server has finished downloading a package.""" + + def __init__(self, package): + self.package = package + + +class StartUnzipMessage(DownloaderMessage): + """Data server has started unzipping a package.""" + + def __init__(self, package): + self.package = package + + +class FinishUnzipMessage(DownloaderMessage): + """Data server has finished unzipping a package.""" + + def __init__(self, package): + self.package = package + + +class UpToDateMessage(DownloaderMessage): + """The package download file is already up-to-date""" + + def __init__(self, package): + self.package = package + + +class StaleMessage(DownloaderMessage): + """The package download file is out-of-date or corrupt""" + + def __init__(self, package): + self.package = package + + +class ErrorMessage(DownloaderMessage): + """Data server encountered an error""" + + def __init__(self, package, message): + self.package = package + if isinstance(message, Exception): + self.message = str(message) + else: + self.message = message + + +class ProgressMessage(DownloaderMessage): + """Indicates how much progress the data server has made""" + + def __init__(self, progress): + self.progress = progress + + +class SelectDownloadDirMessage(DownloaderMessage): + """Indicates what download directory the data server is using""" + + def __init__(self, download_dir): + self.download_dir = download_dir + + 
+###################################################################### +# NLTK Data Server +###################################################################### + + +class Downloader: + """ + A class used to access the NLTK data server, which can be used to + download corpora and other data packages. + """ + + # ///////////////////////////////////////////////////////////////// + # Configuration + # ///////////////////////////////////////////////////////////////// + + INDEX_TIMEOUT = 60 * 60 # 1 hour + """The amount of time after which the cached copy of the data + server index will be considered 'stale,' and will be + re-downloaded.""" + + DEFAULT_URL = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml" + """The default URL for the NLTK data server's index. An + alternative URL can be specified when creating a new + ``Downloader`` object.""" + + # ///////////////////////////////////////////////////////////////// + # Status Constants + # ///////////////////////////////////////////////////////////////// + + INSTALLED = "installed" + """A status string indicating that a package or collection is + installed and up-to-date.""" + NOT_INSTALLED = "not installed" + """A status string indicating that a package or collection is + not installed.""" + STALE = "out of date" + """A status string indicating that a package or collection is + corrupt or out-of-date.""" + PARTIAL = "partial" + """A status string indicating that a collection is partially + installed (i.e., only some of its packages are installed.)""" + + # ///////////////////////////////////////////////////////////////// + # Constructor + # ///////////////////////////////////////////////////////////////// + + def __init__(self, server_index_url=None, download_dir=None): + self._url = server_index_url or self.DEFAULT_URL + """The URL for the data server's index file.""" + + self._collections = {} + """Dictionary from collection identifier to ``Collection``""" + + self._packages = {} + """Dictionary from package identifier to ``Package``""" + + self._download_dir = download_dir + """The default directory to which packages will be downloaded.""" + + self._index = None + """The XML index file downloaded from the data server""" + + self._index_timestamp = None + """Time at which ``self._index`` was downloaded. If it is more + than ``INDEX_TIMEOUT`` seconds old, it will be re-downloaded.""" + + self._status_cache = {} + """Dictionary from package/collection identifier to status + string (``INSTALLED``, ``NOT_INSTALLED``, ``STALE``, or + ``PARTIAL``). Cache is used for packages only, not + collections.""" + + self._errors = None + """Flag for telling if all packages got successfully downloaded or not.""" + + # decide where we're going to save things to. 
+ if self._download_dir is None: + self._download_dir = self.default_download_dir() + + # ///////////////////////////////////////////////////////////////// + # Information + # ///////////////////////////////////////////////////////////////// + + def list( + self, + download_dir=None, + show_packages=True, + show_collections=True, + header=True, + more_prompt=False, + skip_installed=False, + ): + lines = 0 # for more_prompt + if download_dir is None: + download_dir = self._download_dir + print("Using default data directory (%s)" % download_dir) + if header: + print("=" * (26 + len(self._url))) + print(" Data server index for <%s>" % self._url) + print("=" * (26 + len(self._url))) + lines += 3 # for more_prompt + stale = partial = False + + categories = [] + if show_packages: + categories.append("packages") + if show_collections: + categories.append("collections") + for category in categories: + print("%s:" % category.capitalize()) + lines += 1 # for more_prompt + for info in sorted(getattr(self, category)(), key=str): + status = self.status(info, download_dir) + if status == self.INSTALLED and skip_installed: + continue + if status == self.STALE: + stale = True + if status == self.PARTIAL: + partial = True + prefix = { + self.INSTALLED: "*", + self.STALE: "-", + self.PARTIAL: "P", + self.NOT_INSTALLED: " ", + }[status] + name = textwrap.fill( + "-" * 27 + (info.name or info.id), 75, subsequent_indent=27 * " " + )[27:] + print(" [{}] {} {}".format(prefix, info.id.ljust(20, "."), name)) + lines += len(name.split("\n")) # for more_prompt + if more_prompt and lines > 20: + user_input = input("Hit Enter to continue: ") + if user_input.lower() in ("x", "q"): + return + lines = 0 + print() + msg = "([*] marks installed packages" + if stale: + msg += "; [-] marks out-of-date or corrupt packages" + if partial: + msg += "; [P] marks partially installed collections" + print(textwrap.fill(msg + ")", subsequent_indent=" ", width=76)) + + def packages(self): + self._update_index() + return self._packages.values() + + def corpora(self): + self._update_index() + return [pkg for (id, pkg) in self._packages.items() if pkg.subdir == "corpora"] + + def models(self): + self._update_index() + return [pkg for (id, pkg) in self._packages.items() if pkg.subdir != "corpora"] + + def collections(self): + self._update_index() + return self._collections.values() + + # ///////////////////////////////////////////////////////////////// + # Downloading + # ///////////////////////////////////////////////////////////////// + + def _info_or_id(self, info_or_id): + if isinstance(info_or_id, str): + return self.info(info_or_id) + else: + return info_or_id + + # [xx] When during downloading is it 'safe' to abort? Only unsafe + # time is *during* an unzip -- we don't want to leave a + # partially-unzipped corpus in place because we wouldn't notice + # it. But if we had the exact total size of the unzipped corpus, + # then that would be fine. Then we could abort anytime we want! + # So this is really what we should do. That way the threaded + # downloader in the gui can just kill the download thread anytime + # it wants. + + def incr_download(self, info_or_id, download_dir=None, force=False): + # If they didn't specify a download_dir, then use the default one. + if download_dir is None: + download_dir = self._download_dir + yield SelectDownloadDirMessage(download_dir) + + # If they gave us a list of ids, then download each one. 
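+        # For example, ``incr_download(["treebank", "punkt"])`` delegates
+        # to _download_list(), which re-scales each item's ProgressMessages
+        # so that the combined stream still runs from 0 to 100.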
+ if isinstance(info_or_id, (list, tuple)): + yield from self._download_list(info_or_id, download_dir, force) + return + + # Look up the requested collection or package. + try: + info = self._info_or_id(info_or_id) + except (OSError, ValueError) as e: + yield ErrorMessage(None, f"Error loading {info_or_id}: {e}") + return + + # Handle collections. + if isinstance(info, Collection): + yield StartCollectionMessage(info) + yield from self.incr_download(info.children, download_dir, force) + yield FinishCollectionMessage(info) + + # Handle Packages (delegate to a helper function). + else: + yield from self._download_package(info, download_dir, force) + + def _num_packages(self, item): + if isinstance(item, Package): + return 1 + else: + return len(item.packages) + + def _download_list(self, items, download_dir, force): + # Look up the requested items. + for i in range(len(items)): + try: + items[i] = self._info_or_id(items[i]) + except (OSError, ValueError) as e: + yield ErrorMessage(items[i], e) + return + + # Download each item, re-scaling their progress. + num_packages = sum(self._num_packages(item) for item in items) + progress = 0 + for i, item in enumerate(items): + if isinstance(item, Package): + delta = 1.0 / num_packages + else: + delta = len(item.packages) / num_packages + for msg in self.incr_download(item, download_dir, force): + if isinstance(msg, ProgressMessage): + yield ProgressMessage(progress + msg.progress * delta) + else: + yield msg + + progress += 100 * delta + + def _download_package(self, info, download_dir, force): + yield StartPackageMessage(info) + yield ProgressMessage(0) + + # Do we already have the current version? + status = self.status(info, download_dir) + if not force and status == self.INSTALLED: + yield UpToDateMessage(info) + yield ProgressMessage(100) + yield FinishPackageMessage(info) + return + + # Remove the package from our status cache + self._status_cache.pop(info.id, None) + + # Check for (and remove) any old/stale version. + filepath = os.path.join(download_dir, info.filename) + if os.path.exists(filepath): + if status == self.STALE: + yield StaleMessage(info) + os.remove(filepath) + + # Ensure the download_dir exists + if not os.path.exists(download_dir): + os.makedirs(download_dir) + if not os.path.exists(os.path.join(download_dir, info.subdir)): + os.makedirs(os.path.join(download_dir, info.subdir)) + + # Download the file. This will raise an IOError if the url + # is not found. + yield StartDownloadMessage(info) + yield ProgressMessage(5) + try: + infile = urlopen(info.url) + with open(filepath, "wb") as outfile: + num_blocks = max(1, info.size / (1024 * 16)) + for block in itertools.count(): + s = infile.read(1024 * 16) # 16k blocks. + outfile.write(s) + if not s: + break + if block % 2 == 0: # how often? + yield ProgressMessage(min(80, 5 + 75 * (block / num_blocks))) + infile.close() + except OSError as e: + yield ErrorMessage( + info, + "Error downloading %r from <%s>:" "\n %s" % (info.id, info.url, e), + ) + return + yield FinishDownloadMessage(info) + yield ProgressMessage(80) + + # If it's a zipfile, uncompress it. + if info.filename.endswith(".zip"): + zipdir = os.path.join(download_dir, info.subdir) + # Unzip if we're unzipping by default; *or* if it's already + # been unzipped (presumably a previous version). 
+ if info.unzip or os.path.exists(os.path.join(zipdir, info.id)): + yield StartUnzipMessage(info) + for msg in _unzip_iter(filepath, zipdir, verbose=False): + # Somewhat of a hack, but we need a proper package reference + msg.package = info + yield msg + yield FinishUnzipMessage(info) + + yield FinishPackageMessage(info) + + def download( + self, + info_or_id=None, + download_dir=None, + quiet=False, + force=False, + prefix="[nltk_data] ", + halt_on_error=True, + raise_on_error=False, + print_error_to=sys.stderr, + ): + + print_to = functools.partial(print, file=print_error_to) + # If no info or id is given, then use the interactive shell. + if info_or_id is None: + # [xx] hmm -- changing self._download_dir here seems like + # the wrong thing to do. Maybe the _interactive_download + # function should make a new copy of self to use? + if download_dir is not None: + self._download_dir = download_dir + self._interactive_download() + return True + + else: + # Define a helper function for displaying output: + def show(s, prefix2=""): + print_to( + textwrap.fill( + s, + initial_indent=prefix + prefix2, + subsequent_indent=prefix + prefix2 + " " * 4, + ) + ) + + for msg in self.incr_download(info_or_id, download_dir, force): + # Error messages + if isinstance(msg, ErrorMessage): + show(msg.message) + if raise_on_error: + raise ValueError(msg.message) + if halt_on_error: + return False + self._errors = True + if not quiet: + print_to("Error installing package. Retry? [n/y/e]") + choice = input().strip() + if choice in ["y", "Y"]: + if not self.download( + msg.package.id, + download_dir, + quiet, + force, + prefix, + halt_on_error, + raise_on_error, + ): + return False + elif choice in ["e", "E"]: + return False + + # All other messages + if not quiet: + # Collection downloading messages: + if isinstance(msg, StartCollectionMessage): + show("Downloading collection %r" % msg.collection.id) + prefix += " | " + print_to(prefix) + elif isinstance(msg, FinishCollectionMessage): + print_to(prefix) + prefix = prefix[:-4] + if self._errors: + show( + "Downloaded collection %r with errors" + % msg.collection.id + ) + else: + show("Done downloading collection %s" % msg.collection.id) + + # Package downloading messages: + elif isinstance(msg, StartPackageMessage): + show( + "Downloading package %s to %s..." + % (msg.package.id, download_dir) + ) + elif isinstance(msg, UpToDateMessage): + show("Package %s is already up-to-date!" % msg.package.id, " ") + # elif isinstance(msg, StaleMessage): + # show('Package %s is out-of-date or corrupt' % + # msg.package.id, ' ') + elif isinstance(msg, StartUnzipMessage): + show("Unzipping %s." % msg.package.filename, " ") + + # Data directory message: + elif isinstance(msg, SelectDownloadDirMessage): + download_dir = msg.download_dir + return True + + def is_stale(self, info_or_id, download_dir=None): + return self.status(info_or_id, download_dir) == self.STALE + + def is_installed(self, info_or_id, download_dir=None): + return self.status(info_or_id, download_dir) == self.INSTALLED + + def clear_status_cache(self, id=None): + if id is None: + self._status_cache.clear() + else: + self._status_cache.pop(id, None) + + def status(self, info_or_id, download_dir=None): + """ + Return a constant describing the status of the given package + or collection. Status can be one of ``INSTALLED``, + ``NOT_INSTALLED``, ``STALE``, or ``PARTIAL``. 
+ """ + if download_dir is None: + download_dir = self._download_dir + info = self._info_or_id(info_or_id) + + # Handle collections: + if isinstance(info, Collection): + pkg_status = [self.status(pkg.id) for pkg in info.packages] + if self.STALE in pkg_status: + return self.STALE + elif self.PARTIAL in pkg_status: + return self.PARTIAL + elif self.INSTALLED in pkg_status and self.NOT_INSTALLED in pkg_status: + return self.PARTIAL + elif self.NOT_INSTALLED in pkg_status: + return self.NOT_INSTALLED + else: + return self.INSTALLED + + # Handle packages: + else: + filepath = os.path.join(download_dir, info.filename) + if download_dir != self._download_dir: + return self._pkg_status(info, filepath) + else: + if info.id not in self._status_cache: + self._status_cache[info.id] = self._pkg_status(info, filepath) + return self._status_cache[info.id] + + def _pkg_status(self, info, filepath): + if not os.path.exists(filepath): + return self.NOT_INSTALLED + + # Check if the file has the correct size. + try: + filestat = os.stat(filepath) + except OSError: + return self.NOT_INSTALLED + if filestat.st_size != int(info.size): + return self.STALE + + # Check if the file's checksum matches + if md5_hexdigest(filepath) != info.checksum: + return self.STALE + + # If it's a zipfile, and it's been at least partially + # unzipped, then check if it's been fully unzipped. + if filepath.endswith(".zip"): + unzipdir = filepath[:-4] + if not os.path.exists(unzipdir): + return self.INSTALLED # but not unzipped -- ok! + if not os.path.isdir(unzipdir): + return self.STALE + + unzipped_size = sum( + os.stat(os.path.join(d, f)).st_size + for d, _, files in os.walk(unzipdir) + for f in files + ) + if unzipped_size != info.unzipped_size: + return self.STALE + + # Otherwise, everything looks good. + return self.INSTALLED + + def update(self, quiet=False, prefix="[nltk_data] "): + """ + Re-download any packages whose status is STALE. + """ + self.clear_status_cache() + for pkg in self.packages(): + if self.status(pkg) == self.STALE: + self.download(pkg, quiet=quiet, prefix=prefix) + + # ///////////////////////////////////////////////////////////////// + # Index + # ///////////////////////////////////////////////////////////////// + + def _update_index(self, url=None): + """A helper function that ensures that self._index is + up-to-date. If the index is older than self.INDEX_TIMEOUT, + then download it again.""" + # Check if the index is already up-to-date. If so, do nothing. + if not ( + self._index is None + or url is not None + or time.time() - self._index_timestamp > self.INDEX_TIMEOUT + ): + return + + # If a URL was specified, then update our URL. + self._url = url or self._url + + # Download the index file. + self._index = nltk.internals.ElementWrapper( + ElementTree.parse(urlopen(self._url)).getroot() + ) + self._index_timestamp = time.time() + + # Build a dictionary of packages. + packages = [Package.fromxml(p) for p in self._index.findall("packages/package")] + self._packages = {p.id: p for p in packages} + + # Build a dictionary of collections. + collections = [ + Collection.fromxml(c) for c in self._index.findall("collections/collection") + ] + self._collections = {c.id: c for c in collections} + + # Replace identifiers with actual children in collection.children. 
+        # Build a new list rather than deleting entries from the list we
+        # are iterating over, which would skip the element that follows
+        # each removed entry.
+        for collection in self._collections.values():
+            children = []
+            for child_id in collection.children:
+                if child_id in self._packages:
+                    children.append(self._packages[child_id])
+                elif child_id in self._collections:
+                    children.append(self._collections[child_id])
+                else:
+                    print(
+                        "removing collection member with no package: {}".format(
+                            child_id
+                        )
+                    )
+            collection.children = children
+
+        # Fill in collection.packages for each collection.
+        for collection in self._collections.values():
+            packages = {}
+            queue = [collection]
+            for child in queue:
+                if isinstance(child, Collection):
+                    queue.extend(child.children)
+                elif isinstance(child, Package):
+                    packages[child.id] = child
+                else:
+                    pass
+            collection.packages = packages.values()
+
+        # Flush the status cache
+        self._status_cache.clear()
+
+    def index(self):
+        """
+        Return the XML index describing the packages available from
+        the data server.  If necessary, this index will be downloaded
+        from the data server.
+        """
+        self._update_index()
+        return self._index
+
+    def info(self, id):
+        """Return the ``Package`` or ``Collection`` record for the
+        given item."""
+        self._update_index()
+        if id in self._packages:
+            return self._packages[id]
+        if id in self._collections:
+            return self._collections[id]
+        raise ValueError("Package %r not found in index" % id)
+
+    def xmlinfo(self, id):
+        """Return the XML info record for the given item"""
+        self._update_index()
+        for package in self._index.findall("packages/package"):
+            if package.get("id") == id:
+                return package
+        for collection in self._index.findall("collections/collection"):
+            if collection.get("id") == id:
+                return collection
+        raise ValueError("Package %r not found in index" % id)
+
+    # /////////////////////////////////////////////////////////////////
+    # URL & Data Directory
+    # /////////////////////////////////////////////////////////////////
+
+    def _get_url(self):
+        """The URL for the data server's index file."""
+        return self._url
+
+    def _set_url(self, url):
+        """
+        Set a new URL for the data server.  If we're unable to contact
+        the given url, then the original url is kept.
+        """
+        original_url = self._url
+        try:
+            self._update_index(url)
+        except:
+            self._url = original_url
+            raise
+
+    url = property(_get_url, _set_url)
+
+    def default_download_dir(self):
+        """
+        Return the directory to which packages will be downloaded by
+        default.  This value can be overridden using the constructor,
+        or on a case-by-case basis using the ``download_dir`` argument when
+        calling ``download()``.
+
+        On Windows, the default download directory is
+        ``%APPDATA%\\nltk_data``, where *APPDATA* is the user's
+        application-data directory, e.g.
+        ``C:\\Users\\username\\AppData\\Roaming``.
+
+        On all other platforms, the default directory is the first of
+        the following which exists or which can be created with write
+        permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``,
+        ``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``.
+        """
+        # Check if we are on GAE where we cannot write into filesystem.
+        if "APPENGINE_RUNTIME" in os.environ:
+            return
+
+        # Check if we have sufficient permissions to install in a
+        # variety of system-wide locations.
+        for nltkdir in nltk.data.path:
+            if os.path.exists(nltkdir) and nltk.internals.is_writable(nltkdir):
+                return nltkdir
+
+        # On Windows, use %APPDATA%
+        if sys.platform == "win32" and "APPDATA" in os.environ:
+            homedir = os.environ["APPDATA"]
+
+        # Otherwise, install in the user's home directory.
+        else:
+            homedir = os.path.expanduser("~/")
+            if homedir == "~/":
+                raise ValueError("Could not find a default download directory")
+
+        # append "nltk_data" to the home directory
+        return os.path.join(homedir, "nltk_data")
+
+    def _get_download_dir(self):
+        """
+        The default directory to which packages will be downloaded.
+        This defaults to the value returned by ``default_download_dir()``.
+        To override this default on a case-by-case basis, use the
+        ``download_dir`` argument when calling ``download()``.
+        """
+        return self._download_dir
+
+    def _set_download_dir(self, download_dir):
+        self._download_dir = download_dir
+        # Clear the status cache.
+        self._status_cache.clear()
+
+    download_dir = property(_get_download_dir, _set_download_dir)
+
+    # /////////////////////////////////////////////////////////////////
+    # Interactive Shell
+    # /////////////////////////////////////////////////////////////////
+
+    def _interactive_download(self):
+        # Try the GUI first; if that doesn't work, try the simple
+        # interactive shell.
+        if TKINTER:
+            try:
+                DownloaderGUI(self).mainloop()
+            except TclError:
+                DownloaderShell(self).run()
+        else:
+            DownloaderShell(self).run()
+
+
+class DownloaderShell:
+    def __init__(self, dataserver):
+        self._ds = dataserver
+
+    def _simple_interactive_menu(self, *options):
+        print("-" * 75)
+        spc = (68 - sum(len(o) for o in options)) // (len(options) - 1) * " "
+        print("    " + spc.join(options))
+        print("-" * 75)
+
+    def run(self):
+        print("NLTK Downloader")
+        while True:
+            self._simple_interactive_menu(
+                "d) Download",
+                "l) List",
+                "u) Update",
+                "c) Config",
+                "h) Help",
+                "q) Quit",
+            )
+            user_input = input("Downloader> ").strip()
+            if not user_input:
+                print()
+                continue
+            command = user_input.lower().split()[0]
+            args = user_input.split()[1:]
+            try:
+                if command == "l":
+                    print()
+                    self._ds.list(self._ds.download_dir, header=False, more_prompt=True)
+                elif command == "h":
+                    self._simple_interactive_help()
+                elif command == "c":
+                    self._simple_interactive_config()
+                elif command in ("q", "x"):
+                    return
+                elif command == "d":
+                    self._simple_interactive_download(args)
+                elif command == "u":
+                    self._simple_interactive_update()
+                else:
+                    print("Command %r unrecognized" % user_input)
+            except HTTPError as e:
+                print("Error reading from server: %s" % e)
+            except URLError as e:
+                print("Error connecting to server: %s" % e.reason)
+            # try checking if user_input is a package name, &
+            # downloading it?
+ print() + + def _simple_interactive_download(self, args): + if args: + for arg in args: + try: + self._ds.download(arg, prefix=" ") + except (OSError, ValueError) as e: + print(e) + else: + while True: + print() + print("Download which package (l=list; x=cancel)?") + user_input = input(" Identifier> ") + if user_input.lower() == "l": + self._ds.list( + self._ds.download_dir, + header=False, + more_prompt=True, + skip_installed=True, + ) + continue + elif user_input.lower() in ("x", "q", ""): + return + elif user_input: + for id in user_input.split(): + try: + self._ds.download(id, prefix=" ") + except (OSError, ValueError) as e: + print(e) + break + + def _simple_interactive_update(self): + while True: + stale_packages = [] + stale = partial = False + for info in sorted(getattr(self._ds, "packages")(), key=str): + if self._ds.status(info) == self._ds.STALE: + stale_packages.append((info.id, info.name)) + + print() + if stale_packages: + print("Will update following packages (o=ok; x=cancel)") + for pid, pname in stale_packages: + name = textwrap.fill( + "-" * 27 + (pname), 75, subsequent_indent=27 * " " + )[27:] + print(" [ ] {} {}".format(pid.ljust(20, "."), name)) + print() + + user_input = input(" Identifier> ") + if user_input.lower() == "o": + for pid, pname in stale_packages: + try: + self._ds.download(pid, prefix=" ") + except (OSError, ValueError) as e: + print(e) + break + elif user_input.lower() in ("x", "q", ""): + return + else: + print("Nothing to update.") + return + + def _simple_interactive_help(self): + print() + print("Commands:") + print( + " d) Download a package or collection u) Update out of date packages" + ) + print(" l) List packages & collections h) Help") + print(" c) View & Modify Configuration q) Quit") + + def _show_config(self): + print() + print("Data Server:") + print(" - URL: <%s>" % self._ds.url) + print(" - %d Package Collections Available" % len(self._ds.collections())) + print(" - %d Individual Packages Available" % len(self._ds.packages())) + print() + print("Local Machine:") + print(" - Data directory: %s" % self._ds.download_dir) + + def _simple_interactive_config(self): + self._show_config() + while True: + print() + self._simple_interactive_menu( + "s) Show Config", "u) Set Server URL", "d) Set Data Dir", "m) Main Menu" + ) + user_input = input("Config> ").strip().lower() + if user_input == "s": + self._show_config() + elif user_input == "d": + new_dl_dir = input(" New Directory> ").strip() + if new_dl_dir in ("", "x", "q", "X", "Q"): + print(" Cancelled!") + elif os.path.isdir(new_dl_dir): + self._ds.download_dir = new_dl_dir + else: + print("Directory %r not found! Create it first." % new_dl_dir) + elif user_input == "u": + new_url = input(" New URL> ").strip() + if new_url in ("", "x", "q", "X", "Q"): + print(" Cancelled!") + else: + if not new_url.startswith(("http://", "https://")): + new_url = "http://" + new_url + try: + self._ds.url = new_url + except Exception as e: + print(f"Error reading <{new_url!r}>:\n {e}") + elif user_input == "m": + break + + +class DownloaderGUI: + """ + Graphical interface for downloading packages from the NLTK data + server. + """ + + # ///////////////////////////////////////////////////////////////// + # Column Configuration + # ///////////////////////////////////////////////////////////////// + + COLUMNS = [ + "", + "Identifier", + "Name", + "Size", + "Status", + "Unzipped Size", + "Copyright", + "Contact", + "License", + "Author", + "Subdir", + "Checksum", + ] + """A list of the names of columns. 
This controls the order in
+    which the columns will appear.  If this is edited, then
+    ``_package_to_columns()`` may need to be edited to match."""
+
+    COLUMN_WEIGHTS = {"": 0, "Name": 5, "Size": 0, "Status": 0}
+    """A dictionary specifying how columns should be resized when the
+    table is resized.  Columns with weight 0 will not be resized at
+    all; and columns with high weight will be resized more.
+    Default weight (for columns not explicitly listed) is 1."""
+
+    COLUMN_WIDTHS = {
+        "": 1,
+        "Identifier": 20,
+        "Name": 45,
+        "Size": 10,
+        "Unzipped Size": 10,
+        "Status": 12,
+    }
+    """A dictionary specifying how wide each column should be, in
+    characters.  The default width (for columns not explicitly
+    listed) is specified by ``DEFAULT_COLUMN_WIDTH``."""
+
+    DEFAULT_COLUMN_WIDTH = 30
+    """The default width for columns that are not explicitly listed
+    in ``COLUMN_WIDTHS``."""
+
+    INITIAL_COLUMNS = ["", "Identifier", "Name", "Size", "Status"]
+    """The set of columns that should be displayed by default."""
+
+    # Perform a few import-time sanity checks to make sure that the
+    # column configuration variables are defined consistently:
+    for c in COLUMN_WEIGHTS:
+        assert c in COLUMNS
+    for c in COLUMN_WIDTHS:
+        assert c in COLUMNS
+    for c in INITIAL_COLUMNS:
+        assert c in COLUMNS
+
+    # /////////////////////////////////////////////////////////////////
+    # Color Configuration
+    # /////////////////////////////////////////////////////////////////
+
+    _BACKDROP_COLOR = ("#000", "#ccc")
+
+    _ROW_COLOR = {
+        Downloader.INSTALLED: ("#afa", "#080"),
+        Downloader.PARTIAL: ("#ffa", "#880"),
+        Downloader.STALE: ("#faa", "#800"),
+        Downloader.NOT_INSTALLED: ("#fff", "#888"),
+    }
+
+    _MARK_COLOR = ("#000", "#ccc")
+
+    # _FRONT_TAB_COLOR = ('#ccf', '#008')
+    # _BACK_TAB_COLOR = ('#88a', '#448')
+    _FRONT_TAB_COLOR = ("#fff", "#45c")
+    _BACK_TAB_COLOR = ("#aaa", "#67a")
+
+    _PROGRESS_COLOR = ("#f00", "#aaa")
+
+    _TAB_FONT = "helvetica -16 bold"
+
+    # /////////////////////////////////////////////////////////////////
+    # Constructor
+    # /////////////////////////////////////////////////////////////////
+
+    def __init__(self, dataserver, use_threads=True):
+        self._ds = dataserver
+        self._use_threads = use_threads
+
+        # For the threaded downloader:
+        self._download_lock = threading.Lock()
+        self._download_msg_queue = []
+        self._download_abort_queue = []
+        self._downloading = False
+
+        # For tkinter after callbacks:
+        self._afterid = {}
+
+        # A message log.
+        self._log_messages = []
+        self._log_indent = 0
+        self._log("NLTK Downloader Started!")
+
+        # Create the main window.
+        top = self.top = Tk()
+        top.geometry("+50+50")
+        top.title("NLTK Downloader")
+        top.configure(background=self._BACKDROP_COLOR[1])
+
+        # Set up some bindings now, in case anything goes wrong.
+        top.bind("<Control-q>", self.destroy)
+        top.bind("<Control-x>", self.destroy)
+        self._destroyed = False
+
+        self._column_vars = {}
+
+        # Initialize the GUI.
+        self._init_widgets()
+        self._init_menu()
+        try:
+            self._fill_table()
+        except HTTPError as e:
+            showerror("Error reading from server", e)
+        except URLError as e:
+            showerror("Error connecting to server", e.reason)
+
+        self._show_info()
+        self._select_columns()
+        self._table.select(0)
+
+        # Make sure we get notified when we're destroyed, so we can
+        # cancel any download in progress.
+ self._table.bind("", self._destroy) + + def _log(self, msg): + self._log_messages.append( + "{} {}{}".format(time.ctime(), " | " * self._log_indent, msg) + ) + + # ///////////////////////////////////////////////////////////////// + # Internals + # ///////////////////////////////////////////////////////////////// + + def _init_widgets(self): + # Create the top-level frame structures + f1 = Frame(self.top, relief="raised", border=2, padx=8, pady=0) + f1.pack(sid="top", expand=True, fill="both") + f1.grid_rowconfigure(2, weight=1) + f1.grid_columnconfigure(0, weight=1) + Frame(f1, height=8).grid(column=0, row=0) # spacer + tabframe = Frame(f1) + tabframe.grid(column=0, row=1, sticky="news") + tableframe = Frame(f1) + tableframe.grid(column=0, row=2, sticky="news") + buttonframe = Frame(f1) + buttonframe.grid(column=0, row=3, sticky="news") + Frame(f1, height=8).grid(column=0, row=4) # spacer + infoframe = Frame(f1) + infoframe.grid(column=0, row=5, sticky="news") + Frame(f1, height=8).grid(column=0, row=6) # spacer + progressframe = Frame( + self.top, padx=3, pady=3, background=self._BACKDROP_COLOR[1] + ) + progressframe.pack(side="bottom", fill="x") + self.top["border"] = 0 + self.top["highlightthickness"] = 0 + + # Create the tabs + self._tab_names = ["Collections", "Corpora", "Models", "All Packages"] + self._tabs = {} + for i, tab in enumerate(self._tab_names): + label = Label(tabframe, text=tab, font=self._TAB_FONT) + label.pack(side="left", padx=((i + 1) % 2) * 10) + label.bind("", self._select_tab) + self._tabs[tab.lower()] = label + + # Create the table. + column_weights = [self.COLUMN_WEIGHTS.get(column, 1) for column in self.COLUMNS] + self._table = Table( + tableframe, + self.COLUMNS, + column_weights=column_weights, + highlightthickness=0, + listbox_height=16, + reprfunc=self._table_reprfunc, + ) + self._table.columnconfig(0, foreground=self._MARK_COLOR[0]) # marked + for i, column in enumerate(self.COLUMNS): + width = self.COLUMN_WIDTHS.get(column, self.DEFAULT_COLUMN_WIDTH) + self._table.columnconfig(i, width=width) + self._table.pack(expand=True, fill="both") + self._table.focus() + self._table.bind_to_listboxes("", self._download) + self._table.bind("", self._table_mark) + self._table.bind("", self._download) + self._table.bind("", self._prev_tab) + self._table.bind("", self._next_tab) + self._table.bind("", self._mark_all) + + # Create entry boxes for URL & download_dir + infoframe.grid_columnconfigure(1, weight=1) + + info = [ + ("url", "Server Index:", self._set_url), + ("download_dir", "Download Directory:", self._set_download_dir), + ] + self._info = {} + for (i, (key, label, callback)) in enumerate(info): + Label(infoframe, text=label).grid(column=0, row=i, sticky="e") + entry = Entry( + infoframe, + font="courier", + relief="groove", + disabledforeground="#007aff", + foreground="#007aff", + ) + self._info[key] = (entry, callback) + entry.bind("", self._info_save) + entry.bind("", lambda e, key=key: self._info_edit(key)) + entry.grid(column=1, row=i, sticky="ew") + + # If the user edits url or download_dir, and then clicks outside + # the entry box, then save their results. + self.top.bind("", self._info_save) + + # Create Download & Refresh buttons. 
+        self._download_button = Button(
+            buttonframe, text="Download", command=self._download, width=8
+        )
+        self._download_button.pack(side="left")
+        self._refresh_button = Button(
+            buttonframe, text="Refresh", command=self._refresh, width=8
+        )
+        self._refresh_button.pack(side="right")
+
+        # Create Progress bar
+        self._progresslabel = Label(
+            progressframe,
+            text="",
+            foreground=self._BACKDROP_COLOR[0],
+            background=self._BACKDROP_COLOR[1],
+        )
+        self._progressbar = Canvas(
+            progressframe,
+            width=200,
+            height=16,
+            background=self._PROGRESS_COLOR[1],
+            relief="sunken",
+            border=1,
+        )
+        self._init_progressbar()
+        self._progressbar.pack(side="right")
+        self._progresslabel.pack(side="left")
+
+    def _init_menu(self):
+        menubar = Menu(self.top)
+
+        filemenu = Menu(menubar, tearoff=0)
+        filemenu.add_command(
+            label="Download", underline=0, command=self._download, accelerator="Return"
+        )
+        filemenu.add_separator()
+        filemenu.add_command(
+            label="Change Server Index",
+            underline=7,
+            command=lambda: self._info_edit("url"),
+        )
+        filemenu.add_command(
+            label="Change Download Directory",
+            underline=0,
+            command=lambda: self._info_edit("download_dir"),
+        )
+        filemenu.add_separator()
+        filemenu.add_command(label="Show Log", underline=5, command=self._show_log)
+        filemenu.add_separator()
+        filemenu.add_command(
+            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
+        )
+        menubar.add_cascade(label="File", underline=0, menu=filemenu)
+
+        # Create a menu to control which columns of the table are
+        # shown.  n.b.: we never hide the first two columns (mark and
+        # identifier).
+        viewmenu = Menu(menubar, tearoff=0)
+        for column in self._table.column_names[2:]:
+            var = IntVar(self.top)
+            assert column not in self._column_vars
+            self._column_vars[column] = var
+            if column in self.INITIAL_COLUMNS:
+                var.set(1)
+            viewmenu.add_checkbutton(
+                label=column, underline=0, variable=var, command=self._select_columns
+            )
+        menubar.add_cascade(label="View", underline=0, menu=viewmenu)
+
+        # Create a sort menu
+        # [xx] this should be selectbuttons; and it should include
+        # reversed sorts as options.
+        sortmenu = Menu(menubar, tearoff=0)
+        for column in self._table.column_names[1:]:
+            sortmenu.add_command(
+                label="Sort by %s" % column,
+                command=(lambda c=column: self._table.sort_by(c, "ascending")),
+            )
+        sortmenu.add_separator()
+        # sortmenu.add_command(label='Descending Sort:')
+        for column in self._table.column_names[1:]:
+            sortmenu.add_command(
+                label="Reverse sort by %s" % column,
+                command=(lambda c=column: self._table.sort_by(c, "descending")),
+            )
+        menubar.add_cascade(label="Sort", underline=0, menu=sortmenu)
+
+        helpmenu = Menu(menubar, tearoff=0)
+        helpmenu.add_command(label="About", underline=0, command=self.about)
+        helpmenu.add_command(
+            label="Instructions", underline=0, command=self.help, accelerator="F1"
+        )
+        menubar.add_cascade(label="Help", underline=0, menu=helpmenu)
+        self.top.bind("<F1>", self.help)
+
+        self.top.config(menu=menubar)
+
+    def _select_columns(self):
+        for (column, var) in self._column_vars.items():
+            if var.get():
+                self._table.show_column(column)
+            else:
+                self._table.hide_column(column)
+
+    def _refresh(self):
+        self._ds.clear_status_cache()
+        try:
+            self._fill_table()
+        except HTTPError as e:
+            showerror("Error reading from server", e)
+        except URLError as e:
+            showerror("Error connecting to server", e.reason)
+        self._table.select(0)
+
+    def _info_edit(self, info_key):
+        self._info_save()  # just in case.
+        (entry, callback) = self._info[info_key]
+        entry["state"] = "normal"
+        entry["relief"] = "sunken"
+        entry.focus()
+
+    def _info_save(self, e=None):
+        focus = self._table
+        for entry, callback in self._info.values():
+            if entry["state"] == "disabled":
+                continue
+            if e is not None and e.widget is entry and e.keysym != "Return":
+                focus = entry
+            else:
+                entry["state"] = "disabled"
+                entry["relief"] = "groove"
+                callback(entry.get())
+        focus.focus()
+
+    def _table_reprfunc(self, row, col, val):
+        if self._table.column_names[col].endswith("Size"):
+            if isinstance(val, str):
+                return "  %s" % val
+            elif val < 1024**2:
+                return "  %.1f KB" % (val / 1024.0**1)
+            elif val < 1024**3:
+                return "  %.1f MB" % (val / 1024.0**2)
+            else:
+                return "  %.1f GB" % (val / 1024.0**3)
+
+        if col in (0, ""):
+            return str(val)
+        else:
+            return "  %s" % val
+
+    def _set_url(self, url):
+        if url == self._ds.url:
+            return
+        try:
+            self._ds.url = url
+            self._fill_table()
+        except OSError as e:
+            showerror("Error Setting Server Index", str(e))
+        self._show_info()
+
+    def _set_download_dir(self, download_dir):
+        if self._ds.download_dir == download_dir:
+            return
+        # check if the dir exists, and if not, ask if we should create it?
+
+        # Clear our status cache, & re-check what's installed
+        self._ds.download_dir = download_dir
+        try:
+            self._fill_table()
+        except HTTPError as e:
+            showerror("Error reading from server", e)
+        except URLError as e:
+            showerror("Error connecting to server", e.reason)
+        self._show_info()
+
+    def _show_info(self):
+        for entry, cb in self._info.values():
+            entry["state"] = "normal"
+            entry.delete(0, "end")
+        self._info["url"][0].insert(0, self._ds.url)
+        self._info["download_dir"][0].insert(0, self._ds.download_dir)
+        for entry, cb in self._info.values():
+            entry["state"] = "disabled"
+
+    def _prev_tab(self, *e):
+        for i, tab in enumerate(self._tab_names):
+            if tab.lower() == self._tab and i > 0:
+                self._tab = self._tab_names[i - 1].lower()
+                try:
+                    return self._fill_table()
+                except HTTPError as e:
+                    showerror("Error reading from server", e)
+                except URLError as e:
+                    showerror("Error connecting to server", e.reason)
+
+    def _next_tab(self, *e):
+        for i, tab in enumerate(self._tab_names):
+            if tab.lower() == self._tab and i < (len(self._tabs) - 1):
+                self._tab = self._tab_names[i + 1].lower()
+                try:
+                    return self._fill_table()
+                except HTTPError as e:
+                    showerror("Error reading from server", e)
+                except URLError as e:
+                    showerror("Error connecting to server", e.reason)
+
+    def _select_tab(self, event):
+        self._tab = event.widget["text"].lower()
+        try:
+            self._fill_table()
+        except HTTPError as e:
+            showerror("Error reading from server", e)
+        except URLError as e:
+            showerror("Error connecting to server", e.reason)
+
+    _tab = "collections"
+    # _tab = 'corpora'
+    _rows = None
+
+    def _fill_table(self):
+        selected_row = self._table.selected_row()
+        self._table.clear()
+        if self._tab == "all packages":
+            items = self._ds.packages()
+        elif self._tab == "corpora":
+            items = self._ds.corpora()
+        elif self._tab == "models":
+            items = self._ds.models()
+        elif self._tab == "collections":
+            items = self._ds.collections()
+        else:
+            assert 0, "bad tab value %r" % self._tab
+        rows = [self._package_to_columns(item) for item in items]
+        self._table.extend(rows)
+
+        # Highlight the active tab.
+        for tab, label in self._tabs.items():
+            if tab == self._tab:
+                label.configure(
+                    foreground=self._FRONT_TAB_COLOR[0],
+                    background=self._FRONT_TAB_COLOR[1],
+                )
+            else:
+                label.configure(
+                    foreground=self._BACK_TAB_COLOR[0],
+                    background=self._BACK_TAB_COLOR[1],
+                )
+
+        self._table.sort_by("Identifier", order="ascending")
+        self._color_table()
+        self._table.select(selected_row)
+
+        # This is a hack, because the scrollbar isn't updating its
+        # position right -- I'm not sure what the underlying cause is
+        # though.  (This is on OS X w/ python 2.5)  The length of
+        # delay that's necessary seems to depend on how fast the
+        # computer is. :-/
+        self.top.after(150, self._table._scrollbar.set, *self._table._mlb.yview())
+        self.top.after(300, self._table._scrollbar.set, *self._table._mlb.yview())
+
+    def _update_table_status(self):
+        for row_num in range(len(self._table)):
+            status = self._ds.status(self._table[row_num, "Identifier"])
+            self._table[row_num, "Status"] = status
+        self._color_table()
+
+    def _download(self, *e):
+        # If we're using threads, then delegate to the threaded
+        # downloader instead.
+        if self._use_threads:
+            return self._download_threaded(*e)
+
+        marked = [
+            self._table[row, "Identifier"]
+            for row in range(len(self._table))
+            if self._table[row, 0] != ""
+        ]
+        selection = self._table.selected_row()
+        if not marked and selection is not None:
+            marked = [self._table[selection, "Identifier"]]
+
+        download_iter = self._ds.incr_download(marked, self._ds.download_dir)
+        self._log_indent = 0
+        self._download_cb(download_iter, marked)
+
+    _DL_DELAY = 10
+
+    def _download_cb(self, download_iter, ids):
+        try:
+            msg = next(download_iter)
+        except StopIteration:
+            # self._fill_table(sort=False)
+            self._update_table_status()
+            afterid = self.top.after(10, self._show_progress, 0)
+            self._afterid["_download_cb"] = afterid
+            return
+
+        def show(s):
+            self._progresslabel["text"] = s
+            self._log(s)
+
+        if isinstance(msg, ProgressMessage):
+            self._show_progress(msg.progress)
+        elif isinstance(msg, ErrorMessage):
+            show(msg.message)
+            if msg.package is not None:
+                self._select(msg.package.id)
+            self._show_progress(None)
+            return  # halt progress.
+        elif isinstance(msg, StartCollectionMessage):
+            show("Downloading collection %s" % msg.collection.id)
+            self._log_indent += 1
+        elif isinstance(msg, StartPackageMessage):
+            show("Downloading package %s" % msg.package.id)
+        elif isinstance(msg, UpToDateMessage):
+            show("Package %s is up-to-date!" % msg.package.id)
+        # elif isinstance(msg, StaleMessage):
+        #    show('Package %s is out-of-date or corrupt' %
+        #         msg.package.id)
+        elif isinstance(msg, FinishDownloadMessage):
+            show("Finished downloading %r." % msg.package.id)
+        elif isinstance(msg, StartUnzipMessage):
+            show("Unzipping %s" % msg.package.filename)
+        elif isinstance(msg, FinishCollectionMessage):
+            self._log_indent -= 1
+            show("Finished downloading collection %r." % msg.collection.id)
+            self._clear_mark(msg.collection.id)
+        elif isinstance(msg, FinishPackageMessage):
+            self._clear_mark(msg.package.id)
+        afterid = self.top.after(self._DL_DELAY, self._download_cb, download_iter, ids)
+        self._afterid["_download_cb"] = afterid
+
+    def _select(self, id):
+        for row in range(len(self._table)):
+            if self._table[row, "Identifier"] == id:
+                self._table.select(row)
+                return
+
+    def _color_table(self):
+        # Color rows according to status.
+        for row in range(len(self._table)):
+            bg, sbg = self._ROW_COLOR[self._table[row, "Status"]]
+            fg, sfg = ("black", "white")
+            self._table.rowconfig(
+                row,
+                foreground=fg,
+                selectforeground=sfg,
+                background=bg,
+                selectbackground=sbg,
+            )
+            # Color the marked column
+            self._table.itemconfigure(
+                row, 0, foreground=self._MARK_COLOR[0], background=self._MARK_COLOR[1]
+            )
+
+    def _clear_mark(self, id):
+        for row in range(len(self._table)):
+            if self._table[row, "Identifier"] == id:
+                self._table[row, 0] = ""
+
+    def _mark_all(self, *e):
+        for row in range(len(self._table)):
+            self._table[row, 0] = "X"
+
+    def _table_mark(self, *e):
+        selection = self._table.selected_row()
+        if selection >= 0:
+            if self._table[selection][0] != "":
+                self._table[selection, 0] = ""
+            else:
+                self._table[selection, 0] = "X"
+        self._table.select(delta=1)
+
+    def _show_log(self):
+        text = "\n".join(self._log_messages)
+        ShowText(self.top, "NLTK Downloader Log", text)
+
+    def _package_to_columns(self, pkg):
+        """
+        Given a package, return a list of values describing that
+        package, one for each column in ``self.COLUMNS``.
+        """
+        row = []
+        for column_index, column_name in enumerate(self.COLUMNS):
+            if column_index == 0:  # Mark:
+                row.append("")
+            elif column_name == "Identifier":
+                row.append(pkg.id)
+            elif column_name == "Status":
+                row.append(self._ds.status(pkg))
+            else:
+                attr = column_name.lower().replace(" ", "_")
+                row.append(getattr(pkg, attr, "n/a"))
+        return row
+
+    # /////////////////////////////////////////////////////////////////
+    # External Interface
+    # /////////////////////////////////////////////////////////////////
+
+    def destroy(self, *e):
+        if self._destroyed:
+            return
+        self.top.destroy()
+        self._destroyed = True
+
+    def _destroy(self, *e):
+        if self.top is not None:
+            for afterid in self._afterid.values():
+                self.top.after_cancel(afterid)
+
+        # Abort any download in progress.
+        if self._downloading and self._use_threads:
+            self._abort_download()
+
+        # Make sure the garbage collector destroys these now;
+        # otherwise, they may get destroyed when we're not in the main
+        # thread, which would make Tkinter unhappy.
+        self._column_vars.clear()
+
+    def mainloop(self, *args, **kwargs):
+        self.top.mainloop(*args, **kwargs)
+
+    # /////////////////////////////////////////////////////////////////
+    # HELP
+    # /////////////////////////////////////////////////////////////////
+
+    HELP = textwrap.dedent(
+        """\
+    This tool can be used to download a variety of corpora and models
+    that can be used with NLTK.  Each corpus or model is distributed
+    in a single zip file, known as a \"package file.\"  You can
+    download packages individually, or you can download pre-defined
+    collections of packages.
+
+    When you download a package, it will be saved to the \"download
+    directory.\"  A default download directory is chosen when you run
+    the downloader; but you may also select a different download
+    directory.  On Windows, the default download directory is
+    \"%APPDATA%\\nltk_data\".
+
+    The NLTK downloader can be used to download a variety of corpora,
+    models, and other data packages.
+
+    Keyboard shortcuts::
+      [return]\t Download
+      [up]\t Select previous package
+      [down]\t Select next package
+      [left]\t Select previous tab
+      [right]\t Select next tab
+    """
+    )
+
+    def help(self, *e):
+        # The default font's not very legible; try using 'fixed' instead.
+ try: + ShowText( + self.top, + "Help: NLTK Downloader", + self.HELP.strip(), + width=75, + font="fixed", + ) + except: + ShowText(self.top, "Help: NLTK Downloader", self.HELP.strip(), width=75) + + def about(self, *e): + ABOUT = "NLTK Downloader\n" + "Written by Edward Loper" + TITLE = "About: NLTK Downloader" + try: + from tkinter.messagebox import Message + + Message(message=ABOUT, title=TITLE).show() + except ImportError: + ShowText(self.top, TITLE, ABOUT) + + # ///////////////////////////////////////////////////////////////// + # Progress Bar + # ///////////////////////////////////////////////////////////////// + + _gradient_width = 5 + + def _init_progressbar(self): + c = self._progressbar + width, height = int(c["width"]), int(c["height"]) + for i in range(0, (int(c["width"]) * 2) // self._gradient_width): + c.create_line( + i * self._gradient_width + 20, + -20, + i * self._gradient_width - height - 20, + height + 20, + width=self._gradient_width, + fill="#%02x0000" % (80 + abs(i % 6 - 3) * 12), + ) + c.addtag_all("gradient") + c.itemconfig("gradient", state="hidden") + + # This is used to display progress + c.addtag_withtag( + "redbox", c.create_rectangle(0, 0, 0, 0, fill=self._PROGRESS_COLOR[0]) + ) + + def _show_progress(self, percent): + c = self._progressbar + if percent is None: + c.coords("redbox", 0, 0, 0, 0) + c.itemconfig("gradient", state="hidden") + else: + width, height = int(c["width"]), int(c["height"]) + x = percent * int(width) // 100 + 1 + c.coords("redbox", 0, 0, x, height + 1) + + def _progress_alive(self): + c = self._progressbar + if not self._downloading: + c.itemconfig("gradient", state="hidden") + else: + c.itemconfig("gradient", state="normal") + x1, y1, x2, y2 = c.bbox("gradient") + if x1 <= -100: + c.move("gradient", (self._gradient_width * 6) - 4, 0) + else: + c.move("gradient", -4, 0) + afterid = self.top.after(200, self._progress_alive) + self._afterid["_progress_alive"] = afterid + + # ///////////////////////////////////////////////////////////////// + # Threaded downloader + # ///////////////////////////////////////////////////////////////// + + def _download_threaded(self, *e): + # If the user tries to start a new download while we're already + # downloading something, then abort the current download instead. + if self._downloading: + self._abort_download() + return + + # Change the 'download' button to an 'abort' button. + self._download_button["text"] = "Cancel" + + marked = [ + self._table[row, "Identifier"] + for row in range(len(self._table)) + if self._table[row, 0] != "" + ] + selection = self._table.selected_row() + if not marked and selection is not None: + marked = [self._table[selection, "Identifier"]] + + # Create a new data server object for the download operation, + # just in case the user modifies our data server during the + # download (e.g., clicking 'refresh' or editing the index url). + ds = Downloader(self._ds.url, self._ds.download_dir) + + # Start downloading in a separate thread. + assert self._download_msg_queue == [] + assert self._download_abort_queue == [] + self._DownloadThread( + ds, + marked, + self._download_lock, + self._download_msg_queue, + self._download_abort_queue, + ).start() + + # Monitor the download message queue & display its progress. + self._log_indent = 0 + self._downloading = True + self._monitor_message_queue() + + # Display an indication that we're still alive and well by + # cycling the progress bar. 
+        self._progress_alive()
+
+    def _abort_download(self):
+        if self._downloading:
+            self._download_lock.acquire()
+            self._download_abort_queue.append("abort")
+            self._download_lock.release()
+
+    class _DownloadThread(threading.Thread):
+        def __init__(self, data_server, items, lock, message_queue, abort):
+            self.data_server = data_server
+            self.items = items
+            self.lock = lock
+            self.message_queue = message_queue
+            self.abort = abort
+            threading.Thread.__init__(self)
+
+        def run(self):
+            for msg in self.data_server.incr_download(self.items):
+                self.lock.acquire()
+                self.message_queue.append(msg)
+                # Check if we've been told to kill ourselves:
+                if self.abort:
+                    self.message_queue.append("aborted")
+                    self.lock.release()
+                    return
+                self.lock.release()
+            self.lock.acquire()
+            self.message_queue.append("finished")
+            self.lock.release()
+
+    _MONITOR_QUEUE_DELAY = 100
+
+    def _monitor_message_queue(self):
+        def show(s):
+            self._progresslabel["text"] = s
+            self._log(s)
+
+        # Try to acquire the lock without blocking; if it's busy, then
+        # just try again later.  (A blocking acquire here would stall
+        # the GUI thread, contrary to this comment's intent.)
+        if not self._download_lock.acquire(blocking=False):
+            return
+        for msg in self._download_msg_queue:
+
+            # Done downloading?
+            if msg == "finished" or msg == "aborted":
+                # self._fill_table(sort=False)
+                self._update_table_status()
+                self._downloading = False
+                self._download_button["text"] = "Download"
+                del self._download_msg_queue[:]
+                del self._download_abort_queue[:]
+                self._download_lock.release()
+                if msg == "aborted":
+                    show("Download aborted!")
+                    self._show_progress(None)
+                else:
+                    afterid = self.top.after(100, self._show_progress, None)
+                    self._afterid["_monitor_message_queue"] = afterid
+                return
+
+            # All other messages
+            elif isinstance(msg, ProgressMessage):
+                self._show_progress(msg.progress)
+            elif isinstance(msg, ErrorMessage):
+                show(msg.message)
+                if msg.package is not None:
+                    self._select(msg.package.id)
+                self._show_progress(None)
+                self._downloading = False
+                # Clear the queue and release the lock before halting, so
+                # the next download attempt doesn't deadlock or trip the
+                # empty-queue assertion in _download_threaded().
+                del self._download_msg_queue[:]
+                self._download_lock.release()
+                return  # halt progress.
+            elif isinstance(msg, StartCollectionMessage):
+                show("Downloading collection %r" % msg.collection.id)
+                self._log_indent += 1
+            elif isinstance(msg, StartPackageMessage):
+                self._ds.clear_status_cache(msg.package.id)
+                show("Downloading package %r" % msg.package.id)
+            elif isinstance(msg, UpToDateMessage):
+                show("Package %s is up-to-date!" % msg.package.id)
+            # elif isinstance(msg, StaleMessage):
+            #    show('Package %s is out-of-date or corrupt; updating it' %
+            #         msg.package.id)
+            elif isinstance(msg, FinishDownloadMessage):
+                show("Finished downloading %r." % msg.package.id)
+            elif isinstance(msg, StartUnzipMessage):
+                show("Unzipping %s" % msg.package.filename)
+            elif isinstance(msg, FinishUnzipMessage):
+                show("Finished installing %s" % msg.package.id)
+            elif isinstance(msg, FinishCollectionMessage):
+                self._log_indent -= 1
+                show("Finished downloading collection %r." % msg.collection.id)
+                self._clear_mark(msg.collection.id)
+            elif isinstance(msg, FinishPackageMessage):
+                self._update_table_status()
+                self._clear_mark(msg.package.id)
+
+        # Let the user know when we're aborting a download (but
+        # waiting for a good point to abort it, so we don't end up
+        # with a partially unzipped package or anything like that).
+        if self._download_abort_queue:
+            self._progresslabel["text"] = "Aborting download..."
+
+        # Clear the message queue and then release the lock
+        del self._download_msg_queue[:]
+        self._download_lock.release()
+
+        # Check the queue again after MONITOR_QUEUE_DELAY msec.
+        afterid = self.top.after(self._MONITOR_QUEUE_DELAY, self._monitor_message_queue)
+        self._afterid["_monitor_message_queue"] = afterid
+
+
+######################################################################
+# Helper Functions
+######################################################################
+# [xx] It may make sense to move these to nltk.internals.
+
+
+def md5_hexdigest(file):
+    """
+    Calculate and return the MD5 checksum for a given file.
+    ``file`` may either be a filename or an open stream.
+    """
+    if isinstance(file, str):
+        with open(file, "rb") as infile:
+            return _md5_hexdigest(infile)
+    return _md5_hexdigest(file)
+
+
+def _md5_hexdigest(fp):
+    md5_digest = md5()
+    while True:
+        block = fp.read(1024 * 16)  # 16k blocks
+        if not block:
+            break
+        md5_digest.update(block)
+    return md5_digest.hexdigest()
+
+
+# change this to periodically yield progress messages?
+# [xx] get rid of topdir parameter -- we should be checking
+# this when we build the index, anyway.
+def unzip(filename, root, verbose=True):
+    """
+    Extract the contents of the zip file ``filename`` into the
+    directory ``root``.
+    """
+    for message in _unzip_iter(filename, root, verbose):
+        if isinstance(message, ErrorMessage):
+            raise Exception(message)
+
+
+def _unzip_iter(filename, root, verbose=True):
+    if verbose:
+        sys.stdout.write("Unzipping %s" % os.path.split(filename)[1])
+        sys.stdout.flush()
+
+    try:
+        zf = zipfile.ZipFile(filename)
+    except zipfile.error:
+        yield ErrorMessage(filename, "Error with downloaded zip file")
+        return
+    except Exception as e:
+        yield ErrorMessage(filename, e)
+        return
+
+    zf.extractall(root)
+
+    if verbose:
+        print()
+
+
+######################################################################
+# Index Builder
+######################################################################
+# This may move to a different file sometime.
+
+
+def build_index(root, base_url):
+    """
+    Create a new data.xml index file, by combining the xml description
+    files for various packages and collections.  ``root`` should be the
+    path to a directory containing the package xml and zip files; and
+    the collection xml files.  The ``root`` directory is expected to
+    have the following subdirectories::
+
+        root/
+          packages/ .................. subdirectory for packages
+            corpora/ ................. zip & xml files for corpora
+            grammars/ ................ zip & xml files for grammars
+            taggers/ ................. zip & xml files for taggers
+            tokenizers/ .............. zip & xml files for tokenizers
+            etc.
+          collections/ ............... xml files for collections
+
+    For each package, there should be two files: ``package.zip``
+    (where *package* is the package name)
+    which contains the package itself as a compressed zip file; and
+    ``package.xml``, which is an xml description of the package.  The
+    zipfile ``package.zip`` should expand to a single subdirectory
+    named ``package/``.  The base filename ``package`` must match
+    the identifier given in the package's xml file.
+
+    For each collection, there should be a single xml file
+    ``collection.xml`` describing the collection, where *collection*
+    is the name of the collection.
+
+    All identifiers (for both packages and collections) must be unique.
+    """
+    # Find all packages.
+    packages = []
+    for pkg_xml, zf, subdir in _find_packages(os.path.join(root, "packages")):
+        zipstat = os.stat(zf.filename)
+        url = f"{base_url}/{subdir}/{os.path.split(zf.filename)[1]}"
+        unzipped_size = sum(zf_info.file_size for zf_info in zf.infolist())
+
+        # Fill in several fields of the package xml with calculated values.
+        pkg_xml.set("unzipped_size", "%s" % unzipped_size)
+        pkg_xml.set("size", "%s" % zipstat.st_size)
+        pkg_xml.set("checksum", "%s" % md5_hexdigest(zf.filename))
+        pkg_xml.set("subdir", subdir)
+        # pkg_xml.set('svn_revision', _svn_revision(zf.filename))
+        if not pkg_xml.get("url"):
+            pkg_xml.set("url", url)
+
+        # Record the package.
+        packages.append(pkg_xml)
+
+    # Find all collections
+    collections = list(_find_collections(os.path.join(root, "collections")))
+
+    # Check that all UIDs are unique
+    uids = set()
+    for item in packages + collections:
+        if item.get("id") in uids:
+            raise ValueError("Duplicate UID: %s" % item.get("id"))
+        uids.add(item.get("id"))
+
+    # Put it all together
+    top_elt = ElementTree.Element("nltk_data")
+    top_elt.append(ElementTree.Element("packages"))
+    top_elt[0].extend(sorted(packages, key=lambda package: package.get("id")))
+    top_elt.append(ElementTree.Element("collections"))
+    top_elt[1].extend(sorted(collections, key=lambda collection: collection.get("id")))
+
+    _indent_xml(top_elt)
+    return top_elt
+
+
+def _indent_xml(xml, prefix=""):
+    """
+    Helper for ``build_index()``: Given an XML ``ElementTree``, modify
+    its (and its descendants') ``text`` and ``tail`` attributes to
+    generate an indented tree, where each nested element is indented by
+    2 spaces with respect to its parent.
+    """
+    if len(xml) > 0:
+        xml.text = (xml.text or "").strip() + "\n" + prefix + "  "
+        for child in xml:
+            _indent_xml(child, prefix + "  ")
+        for child in xml[:-1]:
+            child.tail = (child.tail or "").strip() + "\n" + prefix + "  "
+        xml[-1].tail = (xml[-1].tail or "").strip() + "\n" + prefix
+
+
+def _check_package(pkg_xml, zipfilename, zf):
+    """
+    Helper for ``build_index()``: Perform some checks to make sure that
+    the given package is consistent.
+    """
+    # The filename must match the id given in the XML file.
+    uid = os.path.splitext(os.path.split(zipfilename)[1])[0]
+    if pkg_xml.get("id") != uid:
+        raise ValueError(
+            "package identifier mismatch ({} vs {})".format(pkg_xml.get("id"), uid)
+        )
+
+    # Zip file must expand to a subdir whose name matches uid.
+    if sum((name != uid and not name.startswith(uid + "/")) for name in zf.namelist()):
+        raise ValueError(
+            "Zipfile %s.zip does not expand to a single "
+            "subdirectory %s/" % (uid, uid)
+        )
+
+
+# update for git?
+def _svn_revision(filename):
+    """
+    Helper for ``build_index()``: Calculate the subversion revision
+    number for a given file (by using ``subprocess`` to run ``svn``).
+    """
+    # text=True decodes stdout/stderr to str, so they can be passed to
+    # textwrap.fill() and split() below without a bytes/str mismatch.
+    p = subprocess.Popen(
+        ["svn", "status", "-v", filename],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        text=True,
+    )
+    (stdout, stderr) = p.communicate()
+    if p.returncode != 0 or stderr or not stdout:
+        raise ValueError(
+            "Error determining svn_revision for %s: %s"
+            % (os.path.split(filename)[1], textwrap.fill(stderr))
+        )
+    return stdout.split()[2]
+
+
+def _find_collections(root):
+    """
+    Helper for ``build_index()``: Yield a list of ElementTree.Element
+    objects, each holding the xml for a single package collection.
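+
+    As an illustrative sketch (the identifiers here are made up), a
+    collection file such as ``mycoll.xml`` might look like::
+
+        <collection id="mycoll" name="My Collection">
+            <item ref="some_package" />
+            <item ref="another_package" />
+        </collection>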
+ """ + for dirname, _subdirs, files in os.walk(root): + for filename in files: + if filename.endswith(".xml"): + xmlfile = os.path.join(dirname, filename) + yield ElementTree.parse(xmlfile).getroot() + + +def _find_packages(root): + """ + Helper for ``build_index()``: Yield a list of tuples + ``(pkg_xml, zf, subdir)``, where: + - ``pkg_xml`` is an ``ElementTree.Element`` holding the xml for a + package + - ``zf`` is a ``zipfile.ZipFile`` for the package's contents. + - ``subdir`` is the subdirectory (relative to ``root``) where + the package was found (e.g. 'corpora' or 'grammars'). + """ + from nltk.corpus.reader.util import _path_from + + # Find all packages. + packages = [] + for dirname, subdirs, files in os.walk(root): + relpath = "/".join(_path_from(root, dirname)) + for filename in files: + if filename.endswith(".xml"): + xmlfilename = os.path.join(dirname, filename) + zipfilename = xmlfilename[:-4] + ".zip" + try: + zf = zipfile.ZipFile(zipfilename) + except Exception as e: + raise ValueError(f"Error reading file {zipfilename!r}!\n{e}") from e + try: + pkg_xml = ElementTree.parse(xmlfilename).getroot() + except Exception as e: + raise ValueError(f"Error reading file {xmlfilename!r}!\n{e}") from e + + # Check that the UID matches the filename + uid = os.path.split(xmlfilename[:-4])[1] + if pkg_xml.get("id") != uid: + raise ValueError( + "package identifier mismatch (%s " + "vs %s)" % (pkg_xml.get("id"), uid) + ) + + # Check that the zipfile expands to a subdir whose + # name matches the uid. + if sum( + (name != uid and not name.startswith(uid + "/")) + for name in zf.namelist() + ): + raise ValueError( + "Zipfile %s.zip does not expand to a " + "single subdirectory %s/" % (uid, uid) + ) + + yield pkg_xml, zf, relpath + + elif filename.endswith(".zip"): + # Warn user in case a .xml does not exist for a .zip + resourcename = os.path.splitext(filename)[0] + xmlfilename = os.path.join(dirname, resourcename + ".xml") + if not os.path.exists(xmlfilename): + warnings.warn( + f"{filename} exists, but {resourcename + '.xml'} cannot be found! 
" + f"This could mean that {resourcename} can not be downloaded.", + stacklevel=2, + ) + + # Don't recurse into svn subdirectories: + try: + subdirs.remove(".svn") + except ValueError: + pass + + +###################################################################### +# Main: +###################################################################### + +# There should be a command-line interface + +# Aliases +_downloader = Downloader() +download = _downloader.download + + +def download_shell(): + DownloaderShell(_downloader).run() + + +def download_gui(): + DownloaderGUI(_downloader).mainloop() + + +def update(): + _downloader.update() + + +if __name__ == "__main__": + from optparse import OptionParser + + parser = OptionParser() + parser.add_option( + "-d", + "--dir", + dest="dir", + help="download package to directory DIR", + metavar="DIR", + ) + parser.add_option( + "-q", + "--quiet", + dest="quiet", + action="store_true", + default=False, + help="work quietly", + ) + parser.add_option( + "-f", + "--force", + dest="force", + action="store_true", + default=False, + help="download even if already installed", + ) + parser.add_option( + "-e", + "--exit-on-error", + dest="halt_on_error", + action="store_true", + default=False, + help="exit if an error occurs", + ) + parser.add_option( + "-u", + "--url", + dest="server_index_url", + default=os.environ.get("NLTK_DOWNLOAD_URL"), + help="download server index url", + ) + + (options, args) = parser.parse_args() + + downloader = Downloader(server_index_url=options.server_index_url) + + if args: + for pkg_id in args: + rv = downloader.download( + info_or_id=pkg_id, + download_dir=options.dir, + quiet=options.quiet, + force=options.force, + halt_on_error=options.halt_on_error, + ) + if rv == False and options.halt_on_error: + break + else: + downloader.download( + download_dir=options.dir, + quiet=options.quiet, + force=options.force, + halt_on_error=options.halt_on_error, + ) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/draw/table.py b/.eggs/nltk-3.8-py3.10.egg/nltk/draw/table.py new file mode 100644 index 0000000000000000000000000000000000000000..e4e5d300e9cad96b367d22cd90017b384b36a1e5 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/draw/table.py @@ -0,0 +1,1177 @@ +# Natural Language Toolkit: Table widget +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# URL: +# For license information, see LICENSE.TXT + +""" +Tkinter widgets for displaying multi-column listboxes and tables. +""" + +import operator +from tkinter import Frame, Label, Listbox, Scrollbar, Tk + +###################################################################### +# Multi-Column Listbox +###################################################################### + + +class MultiListbox(Frame): + """ + A multi-column listbox, where the current selection applies to an + entire row. Based on the MultiListbox Tkinter widget + recipe from the Python Cookbook (https://code.activestate.com/recipes/52266/) + + For the most part, ``MultiListbox`` methods delegate to its + contained listboxes. For any methods that do not have docstrings, + see ``Tkinter.Listbox`` for a description of what that method does. + """ + + # ///////////////////////////////////////////////////////////////// + # Configuration + # ///////////////////////////////////////////////////////////////// + + #: Default configuration values for the frame. + FRAME_CONFIG = dict(background="#888", takefocus=True, highlightthickness=1) + + #: Default configurations for the column labels. 
+    LABEL_CONFIG = dict(
+        borderwidth=1,
+        relief="raised",
+        font="helvetica -16 bold",
+        background="#444",
+        foreground="white",
+    )
+
+    #: Default configuration for the column listboxes.
+    LISTBOX_CONFIG = dict(
+        borderwidth=1,
+        selectborderwidth=0,
+        highlightthickness=0,
+        exportselection=False,
+        selectbackground="#888",
+        activestyle="none",
+        takefocus=False,
+    )
+
+    # /////////////////////////////////////////////////////////////////
+    # Constructor
+    # /////////////////////////////////////////////////////////////////
+
+    def __init__(self, master, columns, column_weights=None, cnf={}, **kw):
+        """
+        Construct a new multi-column listbox widget.
+
+        :param master: The widget that should contain the new
+            multi-column listbox.
+
+        :param columns: Specifies what columns should be included in
+            the new multi-column listbox.  If ``columns`` is an integer,
+            then it is the number of columns to include.  If it is
+            a list, then its length indicates the number of columns
+            to include; and each element of the list will be used as
+            a label for the corresponding column.
+
+        :param cnf, kw: Configuration parameters for this widget.
+            Use ``label_*`` to configure all labels; and ``listbox_*``
+            to configure all listboxes.  E.g.:
+                >>> root = Tk()  # doctest: +SKIP
+                >>> MultiListbox(root, ["Subject", "Sender", "Date"], label_foreground='red').pack()  # doctest: +SKIP
+        """
+        # If columns was specified as an int, convert it to a list.
+        if isinstance(columns, int):
+            columns = list(range(columns))
+            include_labels = False
+        else:
+            include_labels = True
+
+        if len(columns) == 0:
+            raise ValueError("Expected at least one column")
+
+        # Instance variables
+        self._column_names = tuple(columns)
+        self._listboxes = []
+        self._labels = []
+
+        # Pick a default value for column_weights, if none was specified.
+        if column_weights is None:
+            column_weights = [1] * len(columns)
+        elif len(column_weights) != len(columns):
+            raise ValueError("Expected one column_weight for each column")
+        self._column_weights = column_weights
+
+        # Configure our widgets.
+        Frame.__init__(self, master, **self.FRAME_CONFIG)
+        self.grid_rowconfigure(1, weight=1)
+        for i, label in enumerate(self._column_names):
+            self.grid_columnconfigure(i, weight=column_weights[i])
+
+            # Create a label for the column
+            if include_labels:
+                l = Label(self, text=label, **self.LABEL_CONFIG)
+                self._labels.append(l)
+                l.grid(column=i, row=0, sticky="news", padx=0, pady=0)
+                l.column_index = i
+
+            # Create a listbox for the column
+            lb = Listbox(self, **self.LISTBOX_CONFIG)
+            self._listboxes.append(lb)
+            lb.grid(column=i, row=1, sticky="news", padx=0, pady=0)
+            lb.column_index = i
+
+            # Clicking or dragging selects:
+            lb.bind("<Button-1>", self._select)
+            lb.bind("<B1-Motion>", self._select)
+            # Scroll wheel scrolls:
+            lb.bind("<Button-4>", lambda e: self._scroll(-1))
+            lb.bind("<Button-5>", lambda e: self._scroll(+1))
+            lb.bind("<MouseWheel>", lambda e: self._scroll(e.delta))
+            # Button 2 can be used to scan:
+            lb.bind("<Button-2>", lambda e: self.scan_mark(e.x, e.y))
+            lb.bind("<B2-Motion>", lambda e: self.scan_dragto(e.x, e.y))
+            # Dragging outside the window has no effect (disable
+            # the default listbox behavior, which scrolls):
+            lb.bind("<B1-Leave>", lambda e: "break")
+            # Columns can be resized by dragging them:
+            lb.bind("<Button-1>", self._resize_column)
+
+        # Columns can be resized by dragging them.  (This binding is
+        # used if they click on the grid between columns:)
+        self.bind("<Button-1>", self._resize_column)
+
+        # Set up key bindings for the widget:
+        self.bind("<Up>", lambda e: self.select(delta=-1))
+        self.bind("<Down>", lambda e: self.select(delta=1))
+        self.bind("<Prior>", lambda e: self.select(delta=-self._pagesize()))
+        self.bind("<Next>", lambda e: self.select(delta=self._pagesize()))
+
+        # Configuration customizations
+        self.configure(cnf, **kw)
+
+    # /////////////////////////////////////////////////////////////////
+    # Column Resizing
+    # /////////////////////////////////////////////////////////////////
+
+    def _resize_column(self, event):
+        """
+        Callback used to resize a column of the table.  Return ``True``
+        if the column is actually getting resized (if the user clicked
+        on the far left or far right 5 pixels of a label); and
+        ``False`` otherwise.
+        """
+        # If we're already waiting for a button release, then ignore
+        # the new button press.
+        if event.widget.bind("<ButtonRelease>"):
+            return False
+
+        # Decide which column (if any) to resize.
+        self._resize_column_index = None
+        if event.widget is self:
+            for i, lb in enumerate(self._listboxes):
+                if abs(event.x - (lb.winfo_x() + lb.winfo_width())) < 10:
+                    self._resize_column_index = i
+        elif event.x > (event.widget.winfo_width() - 5):
+            self._resize_column_index = event.widget.column_index
+        elif event.x < 5 and event.widget.column_index != 0:
+            self._resize_column_index = event.widget.column_index - 1
+
+        # Bind callbacks that are used to resize it.
+        if self._resize_column_index is not None:
+            event.widget.bind("<Motion>", self._resize_column_motion_cb)
+            event.widget.bind(
+                "<ButtonRelease-%d>" % event.num, self._resize_column_buttonrelease_cb
+            )
+            return True
+        else:
+            return False
+
+    def _resize_column_motion_cb(self, event):
+        lb = self._listboxes[self._resize_column_index]
+        charwidth = lb.winfo_width() / lb["width"]
+
+        x1 = event.x + event.widget.winfo_x()
+        x2 = lb.winfo_x() + lb.winfo_width()
+
+        lb["width"] = max(3, lb["width"] + (x1 - x2) // charwidth)
+
+    def _resize_column_buttonrelease_cb(self, event):
+        event.widget.unbind("<ButtonRelease-%d>" % event.num)
+        event.widget.unbind("<Motion>")
+
+    # /////////////////////////////////////////////////////////////////
+    # Properties
+    # /////////////////////////////////////////////////////////////////
+
+    @property
+    def column_names(self):
+        """
+        A tuple containing the names of the columns used by this
+        multi-column listbox.
+        """
+        return self._column_names
+
+    @property
+    def column_labels(self):
+        """
+        A tuple containing the ``Tkinter.Label`` widgets used to
+        display the label of each column.  If this multi-column
+        listbox was created without labels, then this will be an empty
+        tuple.  These widgets will all be augmented with a
+        ``column_index`` attribute, which can be used to determine
+        which column they correspond to.  This can be convenient,
+        e.g., when defining callbacks for bound events.
+        """
+        return tuple(self._labels)
+
+    @property
+    def listboxes(self):
+        """
+        A tuple containing the ``Tkinter.Listbox`` widgets used to
+        display individual columns.  These widgets will all be
+        augmented with a ``column_index`` attribute, which can be used
+        to determine which column they correspond to.  This can be
+        convenient, e.g., when defining callbacks for bound events.
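+
+        A small usage sketch (``mlb`` is assumed to be an existing
+        ``MultiListbox``; the font value is arbitrary)::
+
+            >>> for lb in mlb.listboxes:  # doctest: +SKIP
+            ...     lb.configure(font='helvetica -12')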
+        """
+        return tuple(self._listboxes)
+
+    # /////////////////////////////////////////////////////////////////
+    # Mouse & Keyboard Callback Functions
+    # /////////////////////////////////////////////////////////////////
+
+    def _select(self, e):
+        i = e.widget.nearest(e.y)
+        self.selection_clear(0, "end")
+        self.selection_set(i)
+        self.activate(i)
+        self.focus()
+
+    def _scroll(self, delta):
+        for lb in self._listboxes:
+            lb.yview_scroll(delta, "unit")
+        return "break"
+
+    def _pagesize(self):
+        """:return: The number of rows that makes up one page"""
+        return int(self.index("@0,1000000")) - int(self.index("@0,0"))
+
+    # /////////////////////////////////////////////////////////////////
+    # Row selection
+    # /////////////////////////////////////////////////////////////////
+
+    def select(self, index=None, delta=None, see=True):
+        """
+        Set the selected row.  If ``index`` is specified, then select
+        row ``index``.  Otherwise, if ``delta`` is specified, then move
+        the current selection by ``delta`` (negative numbers for up,
+        positive numbers for down).  This will not move the selection
+        past the top or the bottom of the list.
+
+        :param see: If true, then call ``self.see()`` with the newly
+            selected index, to ensure that it is visible.
+        """
+        if (index is not None) and (delta is not None):
+            raise ValueError("specify index or delta, but not both")
+
+        # If delta was given, then calculate index.
+        if delta is not None:
+            if len(self.curselection()) == 0:
+                index = -1 + delta
+            else:
+                index = int(self.curselection()[0]) + delta
+
+        # Clear all selected rows.
+        self.selection_clear(0, "end")
+
+        # Select the specified index
+        if index is not None:
+            index = min(max(index, 0), self.size() - 1)
+            # self.activate(index)
+            self.selection_set(index)
+            if see:
+                self.see(index)
+
+    # /////////////////////////////////////////////////////////////////
+    # Configuration
+    # /////////////////////////////////////////////////////////////////
+
+    def configure(self, cnf={}, **kw):
+        """
+        Configure this widget.  Use ``label_*`` to configure all
+        labels; and ``listbox_*`` to configure all listboxes.  E.g.:
+
+        >>> master = Tk()  # doctest: +SKIP
+        >>> mlb = MultiListbox(master, 5)  # doctest: +SKIP
+        >>> mlb.configure(label_foreground='red')  # doctest: +SKIP
+        >>> mlb.configure(listbox_foreground='red')  # doctest: +SKIP
+        """
+        cnf = dict(list(cnf.items()) + list(kw.items()))
+        for (key, val) in list(cnf.items()):
+            if key.startswith("label_") or key.startswith("label-"):
+                for label in self._labels:
+                    label.configure({key[6:]: val})
+            elif key.startswith("listbox_") or key.startswith("listbox-"):
+                for listbox in self._listboxes:
+                    listbox.configure({key[8:]: val})
+            else:
+                Frame.configure(self, {key: val})
+
+    def __setitem__(self, key, val):
+        """
+        Configure this widget.  This is equivalent to
+        ``self.configure({key: val})``.  See ``configure()``.
+        """
+        self.configure({key: val})
+
+    def rowconfigure(self, row_index, cnf={}, **kw):
+        """
+        Configure all table cells in the given row.  Valid keyword
+        arguments are: ``background``, ``bg``, ``foreground``, ``fg``,
+        ``selectbackground``, ``selectforeground``.
+        """
+        for lb in self._listboxes:
+            lb.itemconfigure(row_index, cnf, **kw)
+
+    def columnconfigure(self, col_index, cnf={}, **kw):
+        """
+        Configure all table cells in the given column.  Valid keyword
+        arguments are: ``background``, ``bg``, ``foreground``, ``fg``,
+        ``selectbackground``, ``selectforeground``.
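+
+        For example (an illustrative call; ``mlb`` is assumed to be an
+        existing ``MultiListbox``):
+
+        >>> mlb.columnconfigure(0, background='#afa')  # doctest: +SKIP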
+ """ + lb = self._listboxes[col_index] + + cnf = dict(list(cnf.items()) + list(kw.items())) + for (key, val) in list(cnf.items()): + if key in ( + "background", + "bg", + "foreground", + "fg", + "selectbackground", + "selectforeground", + ): + for i in range(lb.size()): + lb.itemconfigure(i, {key: val}) + else: + lb.configure({key: val}) + + def itemconfigure(self, row_index, col_index, cnf=None, **kw): + """ + Configure the table cell at the given row and column. Valid + keyword arguments are: ``background``, ``bg``, ``foreground``, + ``fg``, ``selectbackground``, ``selectforeground``. + """ + lb = self._listboxes[col_index] + return lb.itemconfigure(row_index, cnf, **kw) + + # ///////////////////////////////////////////////////////////////// + # Value Access + # ///////////////////////////////////////////////////////////////// + + def insert(self, index, *rows): + """ + Insert the given row or rows into the table, at the given + index. Each row value should be a tuple of cell values, one + for each column in the row. Index may be an integer or any of + the special strings (such as ``'end'``) accepted by + ``Tkinter.Listbox``. + """ + for elt in rows: + if len(elt) != len(self._column_names): + raise ValueError( + "rows should be tuples whose length " + "is equal to the number of columns" + ) + for (lb, elts) in zip(self._listboxes, list(zip(*rows))): + lb.insert(index, *elts) + + def get(self, first, last=None): + """ + Return the value(s) of the specified row(s). If ``last`` is + not specified, then return a single row value; otherwise, + return a list of row values. Each row value is a tuple of + cell values, one for each column in the row. + """ + values = [lb.get(first, last) for lb in self._listboxes] + if last: + return [tuple(row) for row in zip(*values)] + else: + return tuple(values) + + def bbox(self, row, col): + """ + Return the bounding box for the given table cell, relative to + this widget's top-left corner. The bounding box is a tuple + of integers ``(left, top, width, height)``. + """ + dx, dy, _, _ = self.grid_bbox(row=0, column=col) + x, y, w, h = self._listboxes[col].bbox(row) + return int(x) + int(dx), int(y) + int(dy), int(w), int(h) + + # ///////////////////////////////////////////////////////////////// + # Hide/Show Columns + # ///////////////////////////////////////////////////////////////// + + def hide_column(self, col_index): + """ + Hide the given column. The column's state is still + maintained: its values will still be returned by ``get()``, and + you must supply its values when calling ``insert()``. It is + safe to call this on a column that is already hidden. + + :see: ``show_column()`` + """ + if self._labels: + self._labels[col_index].grid_forget() + self.listboxes[col_index].grid_forget() + self.grid_columnconfigure(col_index, weight=0) + + def show_column(self, col_index): + """ + Display a column that has been hidden using ``hide_column()``. + It is safe to call this on a column that is not hidden. 
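+
+        For example (an illustrative pair of calls on an existing
+        ``MultiListbox`` instance ``mlb``):
+
+        >>> mlb.hide_column(1)  # doctest: +SKIP
+        >>> mlb.show_column(1)  # doctest: +SKIP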
+        """
+        weight = self._column_weights[col_index]
+        if self._labels:
+            self._labels[col_index].grid(
+                column=col_index, row=0, sticky="news", padx=0, pady=0
+            )
+        self._listboxes[col_index].grid(
+            column=col_index, row=1, sticky="news", padx=0, pady=0
+        )
+        self.grid_columnconfigure(col_index, weight=weight)
+
+    # /////////////////////////////////////////////////////////////////
+    # Binding Methods
+    # /////////////////////////////////////////////////////////////////
+
+    def bind_to_labels(self, sequence=None, func=None, add=None):
+        """
+        Add a binding to each ``Tkinter.Label`` widget in this
+        multi-column listbox that will call ``func`` in response to the
+        event sequence.
+
+        :return: A list of the identifiers of replaced binding
+            functions (if any), allowing for their deletion (to
+            prevent a memory leak).
+        """
+        return [label.bind(sequence, func, add) for label in self.column_labels]
+
+    def bind_to_listboxes(self, sequence=None, func=None, add=None):
+        """
+        Add a binding to each ``Tkinter.Listbox`` widget in this
+        multi-column listbox that will call ``func`` in response to the
+        event sequence.
+
+        :return: A list of the identifiers of replaced binding
+            functions (if any), allowing for their deletion (to
+            prevent a memory leak).
+        """
+        return [listbox.bind(sequence, func, add) for listbox in self.listboxes]
+
+    def bind_to_columns(self, sequence=None, func=None, add=None):
+        """
+        Add a binding to each ``Tkinter.Label`` and ``Tkinter.Listbox``
+        widget in this multi-column listbox that will call ``func`` in
+        response to the event sequence.
+
+        :return: A list of the identifiers of replaced binding
+            functions (if any), allowing for their deletion (to
+            prevent a memory leak).
+        """
+        return self.bind_to_labels(sequence, func, add) + self.bind_to_listboxes(
+            sequence, func, add
+        )
+
+    # /////////////////////////////////////////////////////////////////
+    # Simple Delegation
+    # /////////////////////////////////////////////////////////////////
+
+    # These methods delegate to the first listbox:
+    def curselection(self, *args, **kwargs):
+        return self._listboxes[0].curselection(*args, **kwargs)
+
+    def selection_includes(self, *args, **kwargs):
+        return self._listboxes[0].selection_includes(*args, **kwargs)
+
+    def itemcget(self, *args, **kwargs):
+        return self._listboxes[0].itemcget(*args, **kwargs)
+
+    def size(self, *args, **kwargs):
+        return self._listboxes[0].size(*args, **kwargs)
+
+    def index(self, *args, **kwargs):
+        return self._listboxes[0].index(*args, **kwargs)
+
+    def nearest(self, *args, **kwargs):
+        return self._listboxes[0].nearest(*args, **kwargs)
+
+    # These methods delegate to each listbox (and return None):
+    def activate(self, *args, **kwargs):
+        for lb in self._listboxes:
+            lb.activate(*args, **kwargs)
+
+    def delete(self, *args, **kwargs):
+        for lb in self._listboxes:
+            lb.delete(*args, **kwargs)
+
+    def scan_mark(self, *args, **kwargs):
+        for lb in self._listboxes:
+            lb.scan_mark(*args, **kwargs)
+
+    def scan_dragto(self, *args, **kwargs):
+        for lb in self._listboxes:
+            lb.scan_dragto(*args, **kwargs)
+
+    def see(self, *args, **kwargs):
+        for lb in self._listboxes:
+            lb.see(*args, **kwargs)
+
+    def selection_anchor(self, *args, **kwargs):
+        for lb in self._listboxes:
+            lb.selection_anchor(*args, **kwargs)
+
+    def selection_clear(self, *args, **kwargs):
+        for lb in self._listboxes:
+            lb.selection_clear(*args, **kwargs)
+
+    def selection_set(self, *args, **kwargs):
+        for lb in self._listboxes:
+            lb.selection_set(*args, **kwargs)
+
+    def yview(self, *args,
**kwargs): + for lb in self._listboxes: + v = lb.yview(*args, **kwargs) + return v # if called with no arguments + + def yview_moveto(self, *args, **kwargs): + for lb in self._listboxes: + lb.yview_moveto(*args, **kwargs) + + def yview_scroll(self, *args, **kwargs): + for lb in self._listboxes: + lb.yview_scroll(*args, **kwargs) + + # ///////////////////////////////////////////////////////////////// + # Aliases + # ///////////////////////////////////////////////////////////////// + + itemconfig = itemconfigure + rowconfig = rowconfigure + columnconfig = columnconfigure + select_anchor = selection_anchor + select_clear = selection_clear + select_includes = selection_includes + select_set = selection_set + + # ///////////////////////////////////////////////////////////////// + # These listbox methods are not defined for multi-listbox + # ///////////////////////////////////////////////////////////////// + # def xview(self, *what): pass + # def xview_moveto(self, fraction): pass + # def xview_scroll(self, number, what): pass + + +###################################################################### +# Table +###################################################################### + + +class Table: + """ + A display widget for a table of values, based on a ``MultiListbox`` + widget. For many purposes, ``Table`` can be treated as a + list-of-lists. E.g., table[i] is a list of the values for row i; + and table.append(row) adds a new row with the given list of + values. Individual cells can be accessed using table[i,j], which + refers to the j-th column of the i-th row. This can be used to + both read and write values from the table. E.g.: + + >>> table[i,j] = 'hello' # doctest: +SKIP + + The column (j) can be given either as an index number, or as a + column name. E.g., the following prints the value in the 3rd row + for the 'First Name' column: + + >>> print(table[3, 'First Name']) # doctest: +SKIP + John + + You can configure the colors for individual rows, columns, or + cells using ``rowconfig()``, ``columnconfig()``, and ``itemconfig()``. + The color configuration for each row will be preserved if the + table is modified; however, when new rows are added, any color + configurations that have been made for *columns* will not be + applied to the new row. + + Note: Although ``Table`` acts like a widget in some ways (e.g., it + defines ``grid()``, ``pack()``, and ``bind()``), it is not itself a + widget; it just contains one. This is because widgets need to + define ``__getitem__()``, ``__setitem__()``, and ``__nonzero__()`` in + a way that's incompatible with the fact that ``Table`` behaves as a + list-of-lists. + + :ivar _mlb: The multi-column listbox used to display this table's data. + :ivar _rows: A list-of-lists used to hold the cell values of this + table. Each element of _rows is a row value, i.e., a list of + cell values, one for each column in the row. + """ + + def __init__( + self, + master, + column_names, + rows=None, + column_weights=None, + scrollbar=True, + click_to_sort=True, + reprfunc=None, + cnf={}, + **kw + ): + """ + Construct a new Table widget. + + :type master: Tkinter.Widget + :param master: The widget that should contain the new table. + :type column_names: list(str) + :param column_names: A list of names for the columns; these + names will be used to create labels for each column; + and can be used as an index when reading or writing + cell values from the table. + :type rows: list(list) + :param rows: A list of row values used to initialize the table. 
+            Each row value should be a tuple of cell values, one for
+            each column in the row.
+        :type scrollbar: bool
+        :param scrollbar: If true, then create a scrollbar for the
+            new table widget.
+        :type click_to_sort: bool
+        :param click_to_sort: If true, then create bindings that will
+            sort the table's rows by a given column's values if the
+            user clicks on that column's label.
+        :type reprfunc: function
+        :param reprfunc: If specified, then use this function to
+            convert each table cell value to a string suitable for
+            display.  ``reprfunc`` has the following signature:
+            reprfunc(row_index, col_index, cell_value) -> str
+            (Note that the column is specified by index, not by name.)
+        :param cnf, kw: Configuration parameters for this widget's
+            contained ``MultiListbox``.  See ``MultiListbox.__init__()``
+            for details.
+        """
+        self._num_columns = len(column_names)
+        self._reprfunc = reprfunc
+        self._frame = Frame(master)
+
+        self._column_name_to_index = {c: i for (i, c) in enumerate(column_names)}
+
+        # Make a copy of the rows & check that it's valid.
+        if rows is None:
+            self._rows = []
+        else:
+            self._rows = [[v for v in row] for row in rows]
+        for row in self._rows:
+            self._checkrow(row)
+
+        # Create our multi-list box.
+        self._mlb = MultiListbox(self._frame, column_names, column_weights, cnf, **kw)
+        self._mlb.pack(side="left", expand=True, fill="both")
+
+        # Optional scrollbar
+        if scrollbar:
+            sb = Scrollbar(self._frame, orient="vertical", command=self._mlb.yview)
+            self._mlb.listboxes[0]["yscrollcommand"] = sb.set
+            # for listbox in self._mlb.listboxes:
+            #     listbox['yscrollcommand'] = sb.set
+            sb.pack(side="right", fill="y")
+            self._scrollbar = sb
+
+        # Set up sorting
+        self._sortkey = None
+        if click_to_sort:
+            for i, l in enumerate(self._mlb.column_labels):
+                l.bind("<Button-1>", self._sort)
+
+        # Fill in our multi-list box.
+        self._fill_table()
+
+    # /////////////////////////////////////////////////////////////////
+    # { Widget-like Methods
+    # /////////////////////////////////////////////////////////////////
+    # These all just delegate to either our frame or our MLB.
+
+    def pack(self, *args, **kwargs):
+        """Position this table's main frame widget in its parent
+        widget.  See ``Tkinter.Frame.pack()`` for more info."""
+        self._frame.pack(*args, **kwargs)
+
+    def grid(self, *args, **kwargs):
+        """Position this table's main frame widget in its parent
+        widget.  See ``Tkinter.Frame.grid()`` for more info."""
+        self._frame.grid(*args, **kwargs)
+
+    def focus(self):
+        """Direct (keyboard) input focus to this widget."""
+        self._mlb.focus()
+
+    def bind(self, sequence=None, func=None, add=None):
+        """Add a binding to this table's main frame that will call
+        ``func`` in response to the event sequence."""
+        self._mlb.bind(sequence, func, add)
+
+    def rowconfigure(self, row_index, cnf={}, **kw):
+        """:see: ``MultiListbox.rowconfigure()``"""
+        self._mlb.rowconfigure(row_index, cnf, **kw)
+
+    def columnconfigure(self, col_index, cnf={}, **kw):
+        """:see: ``MultiListbox.columnconfigure()``"""
+        col_index = self.column_index(col_index)
+        self._mlb.columnconfigure(col_index, cnf, **kw)
+
+    def itemconfigure(self, row_index, col_index, cnf=None, **kw):
+        """:see: ``MultiListbox.itemconfigure()``"""
+        col_index = self.column_index(col_index)
+        return self._mlb.itemconfigure(row_index, col_index, cnf, **kw)
+
+    def bind_to_labels(self, sequence=None, func=None, add=None):
+        """:see: ``MultiListbox.bind_to_labels()``"""
+        return self._mlb.bind_to_labels(sequence, func, add)
+
+    def bind_to_listboxes(self, sequence=None, func=None, add=None):
+        """:see: ``MultiListbox.bind_to_listboxes()``"""
+        return self._mlb.bind_to_listboxes(sequence, func, add)
+
+    def bind_to_columns(self, sequence=None, func=None, add=None):
+        """:see: ``MultiListbox.bind_to_columns()``"""
+        return self._mlb.bind_to_columns(sequence, func, add)
+
+    rowconfig = rowconfigure
+    columnconfig = columnconfigure
+    itemconfig = itemconfigure
+
+    # /////////////////////////////////////////////////////////////////
+    # { Table as list-of-lists
+    # /////////////////////////////////////////////////////////////////
+
+    def insert(self, row_index, rowvalue):
+        """
+        Insert a new row into the table, so that its row index will be
+        ``row_index``.  If the table contains any rows whose row index
+        is greater than or equal to ``row_index``, then they will be
+        shifted down.
+
+        :param rowvalue: A tuple of cell values, one for each column
+            in the new row.
+        """
+        self._checkrow(rowvalue)
+        self._rows.insert(row_index, rowvalue)
+        if self._reprfunc is not None:
+            rowvalue = [
+                self._reprfunc(row_index, j, v) for (j, v) in enumerate(rowvalue)
+            ]
+        self._mlb.insert(row_index, rowvalue)
+        if self._DEBUG:
+            self._check_table_vs_mlb()
+
+    def extend(self, rowvalues):
+        """
+        Add new rows at the end of the table.
+
+        :param rowvalues: A list of row values used to initialize the
+            table.  Each row value should be a tuple of cell values,
+            one for each column in the row.
+        """
+        for rowvalue in rowvalues:
+            self.append(rowvalue)
+        if self._DEBUG:
+            self._check_table_vs_mlb()
+
+    def append(self, rowvalue):
+        """
+        Add a new row to the end of the table.
+
+        :param rowvalue: A tuple of cell values, one for each column
+            in the new row.
+        """
+        self.insert(len(self._rows), rowvalue)
+        if self._DEBUG:
+            self._check_table_vs_mlb()
+
+    def clear(self):
+        """
+        Delete all rows in this table.
+        """
+        self._rows = []
+        self._mlb.delete(0, "end")
+        if self._DEBUG:
+            self._check_table_vs_mlb()
+
+    def __getitem__(self, index):
+        """
+        Return the value of a row or a cell in this table.  If
+        ``index`` is an integer, then return the row value for the
+        ``index``th row.  This row value consists of a tuple of cell
+        values, one for each column in the row.  If ``index`` is a
+        tuple of two integers, ``(i,j)``, then return the value of the
+        cell in the ``i``th row and the ``j``th column.
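+
+        For example (illustrative accesses; ``table`` is assumed to be
+        a populated ``Table``):
+
+        >>> table[0]  # the whole first row, as a tuple  # doctest: +SKIP
+        >>> table[0, 1]  # the cell at row 0, column 1  # doctest: +SKIP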
+ """ + if isinstance(index, slice): + raise ValueError("Slicing not supported") + elif isinstance(index, tuple) and len(index) == 2: + return self._rows[index[0]][self.column_index(index[1])] + else: + return tuple(self._rows[index]) + + def __setitem__(self, index, val): + """ + Replace the value of a row or a cell in this table with + ``val``. + + If ``index`` is an integer, then ``val`` should be a row value + (i.e., a tuple of cell values, one for each column). In this + case, the values of the ``index``th row of the table will be + replaced with the values in ``val``. + + If ``index`` is a tuple of integers, ``(i,j)``, then replace the + value of the cell in the ``i``th row and ``j``th column with + ``val``. + """ + if isinstance(index, slice): + raise ValueError("Slicing not supported") + + # table[i,j] = val + elif isinstance(index, tuple) and len(index) == 2: + i, j = index[0], self.column_index(index[1]) + config_cookie = self._save_config_info([i]) + self._rows[i][j] = val + if self._reprfunc is not None: + val = self._reprfunc(i, j, val) + self._mlb.listboxes[j].insert(i, val) + self._mlb.listboxes[j].delete(i + 1) + self._restore_config_info(config_cookie) + + # table[i] = val + else: + config_cookie = self._save_config_info([index]) + self._checkrow(val) + self._rows[index] = list(val) + if self._reprfunc is not None: + val = [self._reprfunc(index, j, v) for (j, v) in enumerate(val)] + self._mlb.insert(index, val) + self._mlb.delete(index + 1) + self._restore_config_info(config_cookie) + + def __delitem__(self, row_index): + """ + Delete the ``row_index``th row from this table. + """ + if isinstance(row_index, slice): + raise ValueError("Slicing not supported") + if isinstance(row_index, tuple) and len(row_index) == 2: + raise ValueError("Cannot delete a single cell!") + del self._rows[row_index] + self._mlb.delete(row_index) + if self._DEBUG: + self._check_table_vs_mlb() + + def __len__(self): + """ + :return: the number of rows in this table. + """ + return len(self._rows) + + def _checkrow(self, rowvalue): + """ + Helper function: check that a given row value has the correct + number of elements; and if not, raise an exception. + """ + if len(rowvalue) != self._num_columns: + raise ValueError( + "Row %r has %d columns; expected %d" + % (rowvalue, len(rowvalue), self._num_columns) + ) + + # ///////////////////////////////////////////////////////////////// + # Columns + # ///////////////////////////////////////////////////////////////// + + @property + def column_names(self): + """A list of the names of the columns in this table.""" + return self._mlb.column_names + + def column_index(self, i): + """ + If ``i`` is a valid column index integer, then return it as is. + Otherwise, check if ``i`` is used as the name for any column; + if so, return that column's index. Otherwise, raise a + ``KeyError`` exception. + """ + if isinstance(i, int) and 0 <= i < self._num_columns: + return i + else: + # This raises a key error if the column is not found. 
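+            # (e.g. table.column_index('First Name') returns the index
+            # of the 'First Name' column from the class docstring
+            # example -- an illustrative lookup.)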
+            return self._column_name_to_index[i]
+
+    def hide_column(self, column_index):
+        """:see: ``MultiListbox.hide_column()``"""
+        self._mlb.hide_column(self.column_index(column_index))
+
+    def show_column(self, column_index):
+        """:see: ``MultiListbox.show_column()``"""
+        self._mlb.show_column(self.column_index(column_index))
+
+    # /////////////////////////////////////////////////////////////////
+    # Selection
+    # /////////////////////////////////////////////////////////////////
+
+    def selected_row(self):
+        """
+        Return the index of the currently selected row, or None if
+        no row is selected.  To get the row value itself, use
+        ``table[table.selected_row()]``.
+        """
+        sel = self._mlb.curselection()
+        if sel:
+            return int(sel[0])
+        else:
+            return None
+
+    def select(self, index=None, delta=None, see=True):
+        """:see: ``MultiListbox.select()``"""
+        self._mlb.select(index, delta, see)
+
+    # /////////////////////////////////////////////////////////////////
+    # Sorting
+    # /////////////////////////////////////////////////////////////////
+
+    def sort_by(self, column_index, order="toggle"):
+        """
+        Sort the rows in this table, using the specified column's
+        values as a sort key.
+
+        :param column_index: Specifies which column to sort, using
+            either a column index (int) or a column's label name
+            (str).
+
+        :param order: Specifies whether to sort the values in
+            ascending or descending order:
+
+            - ``'ascending'``: Sort from least to greatest.
+            - ``'descending'``: Sort from greatest to least.
+            - ``'toggle'``: If the most recent call to ``sort_by()``
+              sorted the table by the same column (``column_index``),
+              then reverse the rows; otherwise sort in ascending
+              order.
+        """
+        if order not in ("ascending", "descending", "toggle"):
+            raise ValueError(
+                'sort_by(): order should be "ascending", "descending", or "toggle".'
+            )
+        column_index = self.column_index(column_index)
+        config_cookie = self._save_config_info(index_by_id=True)
+
+        # Sort the rows.
+        if order == "toggle" and column_index == self._sortkey:
+            self._rows.reverse()
+        else:
+            self._rows.sort(
+                key=operator.itemgetter(column_index), reverse=(order == "descending")
+            )
+            self._sortkey = column_index
+
+        # Redraw the table.
+        self._fill_table()
+        self._restore_config_info(config_cookie, index_by_id=True, see=True)
+        if self._DEBUG:
+            self._check_table_vs_mlb()
+
+    def _sort(self, event):
+        """Event handler for clicking on a column label -- sort by
+        that column."""
+        column_index = event.widget.column_index
+
+        # If they click on the far-left or far-right of a column's
+        # label, then resize rather than sorting.
+        if self._mlb._resize_column(event):
+            return "continue"
+
+        # Otherwise, sort.
+        else:
+            self.sort_by(column_index)
+            return "continue"
+
+    # /////////////////////////////////////////////////////////////////
+    # { Table Drawing Helpers
+    # /////////////////////////////////////////////////////////////////
+
+    def _fill_table(self, save_config=True):
+        """
+        Re-draw the table from scratch, by clearing out the table's
+        multi-column listbox; and then filling it in with values from
+        ``self._rows``.  Note that any cell-, row-, or column-specific
+        color configuration that has been done will be lost.  The
+        selection will also be lost -- i.e., no row will be selected
+        after this call completes.
+        """
+        self._mlb.delete(0, "end")
+        for i, row in enumerate(self._rows):
+            if self._reprfunc is not None:
+                row = [self._reprfunc(i, j, v) for (j, v) in enumerate(row)]
+            self._mlb.insert("end", row)
+
+    def _get_itemconfig(self, r, c):
+        return {
+            k: self._mlb.itemconfig(r, c, k)[-1]
+            for k in (
+                "foreground",
+                "selectforeground",
+                "background",
+                "selectbackground",
+            )
+        }
+
+    def _save_config_info(self, row_indices=None, index_by_id=False):
+        """
+        Return a 'cookie' containing information about which row is
+        selected, and what color configurations have been applied.
+        This information can then be re-applied to the table (after
+        making modifications) using ``_restore_config_info()``.  Color
+        configuration information will be saved for any rows in
+        ``row_indices``, or in the entire table, if
+        ``row_indices=None``.  If ``index_by_id=True``, then the cookie
+        will associate rows with their configuration information based
+        on the rows' python id.  This is useful when performing
+        operations that re-arrange the rows (e.g. ``sort``).  If
+        ``index_by_id=False``, then it is assumed that all rows will be
+        in the same order when ``_restore_config_info()`` is called.
+        """
+        # Default value for row_indices is all rows.
+        if row_indices is None:
+            row_indices = list(range(len(self._rows)))
+
+        # Look up our current selection.
+        selection = self.selected_row()
+        if index_by_id and selection is not None:
+            selection = id(self._rows[selection])
+
+        # Look up the color configuration info for each row.
+        if index_by_id:
+            config = {
+                id(self._rows[r]): [
+                    self._get_itemconfig(r, c) for c in range(self._num_columns)
+                ]
+                for r in row_indices
+            }
+        else:
+            config = {
+                r: [self._get_itemconfig(r, c) for c in range(self._num_columns)]
+                for r in row_indices
+            }
+
+        return selection, config
+
+    def _restore_config_info(self, cookie, index_by_id=False, see=False):
+        """
+        Restore selection & color configuration information that was
+        saved using ``_save_config_info``.
+        """
+        selection, config = cookie
+
+        # Clear the selection.
+        if selection is None:
+            self._mlb.selection_clear(0, "end")
+
+        # Restore selection & color config
+        if index_by_id:
+            for r, row in enumerate(self._rows):
+                if id(row) in config:
+                    for c in range(self._num_columns):
+                        self._mlb.itemconfigure(r, c, config[id(row)][c])
+                    if id(row) == selection:
+                        self._mlb.select(r, see=see)
+        else:
+            if selection is not None:
+                self._mlb.select(selection, see=see)
+            for r in config:
+                for c in range(self._num_columns):
+                    self._mlb.itemconfigure(r, c, config[r][c])
+
+    # /////////////////////////////////////////////////////////////////
+    # Debugging (Invariant Checker)
+    # /////////////////////////////////////////////////////////////////
+
+    _DEBUG = False
+    """If true, then run ``_check_table_vs_mlb()`` after any operation
+    that modifies the table."""
+
+    def _check_table_vs_mlb(self):
+        """
+        Verify that the contents of the table's ``_rows`` variable match
+        the contents of its multi-listbox (``_mlb``).  This is just
+        included for debugging purposes, to make sure that the
+        list-modifying operations are working correctly.
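+
+        To enable this invariant check after every modifying
+        operation, set the class variable ``_DEBUG`` (illustrative):
+
+        >>> Table._DEBUG = True  # doctest: +SKIP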
+        """
+        for col in self._mlb.listboxes:
+            assert len(self) == col.size()
+        for row in self:
+            assert len(row) == self._num_columns
+        assert self._num_columns == len(self._mlb.column_names)
+        # assert self._column_names == self._mlb.column_names
+        for i, row in enumerate(self):
+            for j, cell in enumerate(row):
+                if self._reprfunc is not None:
+                    cell = self._reprfunc(i, j, cell)
+                assert self._mlb.get(i)[j] == cell
+
+
+######################################################################
+# Demo/Test Function
+######################################################################
+
+# update this to use new WordNet API
+def demo():
+    root = Tk()
+    root.bind("<Control-q>", lambda e: root.destroy())
+
+    table = Table(
+        root,
+        "Word Synset Hypernym Hyponym".split(),
+        column_weights=[0, 1, 1, 1],
+        reprfunc=(lambda i, j, s: " %s" % s),
+    )
+    table.pack(expand=True, fill="both")
+
+    from nltk.corpus import brown, wordnet
+
+    for word, pos in sorted(set(brown.tagged_words()[:500])):
+        if pos[0] != "N":
+            continue
+        word = word.lower()
+        for synset in wordnet.synsets(word):
+            try:
+                hyper_def = synset.hypernyms()[0].definition()
+            except IndexError:
+                hyper_def = "*none*"
+            try:
+                hypo_def = synset.hyponyms()[0].definition()
+            except IndexError:
+                hypo_def = "*none*"
+            table.append([word, synset.definition(), hyper_def, hypo_def])
+
+    table.columnconfig("Word", background="#afa")
+    table.columnconfig("Synset", background="#efe")
+    table.columnconfig("Hypernym", background="#fee")
+    table.columnconfig("Hyponym", background="#ffe")
+    for row in range(len(table)):
+        for column in ("Hypernym", "Hyponym"):
+            if table[row, column] == "*none*":
+                table.itemconfig(
+                    row, column, foreground="#666", selectforeground="#666"
+                )
+    root.mainloop()
+
+
+if __name__ == "__main__":
+    demo()
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/draw/tree.py b/.eggs/nltk-3.8-py3.10.egg/nltk/draw/tree.py
new file mode 100644
index 0000000000000000000000000000000000000000..82394e489c68ac60176609cd275782b954dba7b1
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/draw/tree.py
@@ -0,0 +1,1129 @@
+# Natural Language Toolkit: Graphical Representations for Trees
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Edward Loper
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Graphically display a Tree.
+"""
+
+from tkinter import IntVar, Menu, Tk
+
+from nltk.draw.util import (
+    BoxWidget,
+    CanvasFrame,
+    CanvasWidget,
+    OvalWidget,
+    ParenWidget,
+    TextWidget,
+)
+from nltk.tree import Tree
+from nltk.util import in_idle
+
+##//////////////////////////////////////////////////////
+##  Tree Segment
+##//////////////////////////////////////////////////////
+
+
+class TreeSegmentWidget(CanvasWidget):
+    """
+    A canvas widget that displays a single segment of a hierarchical
+    tree.  Each ``TreeSegmentWidget`` connects a single "node widget"
+    to a sequence of zero or more "subtree widgets".  By default, the
+    bottom of the node is connected to the top of each subtree by a
+    single line.  However, if the ``roof`` attribute is set, then a
+    single triangular "roof" will connect the node to all of its
+    children.
+
+    Attributes:
+
+    - ``roof``: What sort of connection to draw between the node and
+      its subtrees.  If ``roof`` is true, draw a single triangular
+      "roof" over the subtrees.  If ``roof`` is false, draw a line
+      between each subtree and the node.  Default value is false.
+    - ``xspace``: The amount of horizontal space to leave between
+      subtrees when managing this widget.  Default value is 10.
+ - ``yspace``: The amount of space to place between the node and + its children when managing this widget. Default value is 15. + - ``color``: The color of the lines connecting the node to its + subtrees; and of the outline of the triangular roof. Default + value is ``'#006060'``. + - ``fill``: The fill color for the triangular roof. Default + value is ``''`` (no fill). + - ``width``: The width of the lines connecting the node to its + subtrees; and of the outline of the triangular roof. Default + value is 1. + - ``orientation``: Determines whether the tree branches downwards + or rightwards. Possible values are ``'horizontal'`` and + ``'vertical'``. The default value is ``'vertical'`` (i.e., + branch downwards). + - ``draggable``: whether the widget can be dragged by the user. + """ + + def __init__(self, canvas, label, subtrees, **attribs): + """ + :type node: + :type subtrees: list(CanvasWidgetI) + """ + self._label = label + self._subtrees = subtrees + + # Attributes + self._horizontal = 0 + self._roof = 0 + self._xspace = 10 + self._yspace = 15 + self._ordered = False + + # Create canvas objects. + self._lines = [canvas.create_line(0, 0, 0, 0, fill="#006060") for c in subtrees] + self._polygon = canvas.create_polygon( + 0, 0, fill="", state="hidden", outline="#006060" + ) + + # Register child widgets (label + subtrees) + self._add_child_widget(label) + for subtree in subtrees: + self._add_child_widget(subtree) + + # Are we currently managing? + self._managing = False + + CanvasWidget.__init__(self, canvas, **attribs) + + def __setitem__(self, attr, value): + canvas = self.canvas() + if attr == "roof": + self._roof = value + if self._roof: + for l in self._lines: + canvas.itemconfig(l, state="hidden") + canvas.itemconfig(self._polygon, state="normal") + else: + for l in self._lines: + canvas.itemconfig(l, state="normal") + canvas.itemconfig(self._polygon, state="hidden") + elif attr == "orientation": + if value == "horizontal": + self._horizontal = 1 + elif value == "vertical": + self._horizontal = 0 + else: + raise ValueError("orientation must be horizontal or vertical") + elif attr == "color": + for l in self._lines: + canvas.itemconfig(l, fill=value) + canvas.itemconfig(self._polygon, outline=value) + elif isinstance(attr, tuple) and attr[0] == "color": + # Set the color of an individual line. 
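+            # (e.g. segment[('color', 0)] = 'red' recolors only the
+            # line leading to the first subtree -- an illustrative
+            # call.)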
+ l = self._lines[int(attr[1])] + canvas.itemconfig(l, fill=value) + elif attr == "fill": + canvas.itemconfig(self._polygon, fill=value) + elif attr == "width": + canvas.itemconfig(self._polygon, {attr: value}) + for l in self._lines: + canvas.itemconfig(l, {attr: value}) + elif attr in ("xspace", "yspace"): + if attr == "xspace": + self._xspace = value + elif attr == "yspace": + self._yspace = value + self.update(self._label) + elif attr == "ordered": + self._ordered = value + else: + CanvasWidget.__setitem__(self, attr, value) + + def __getitem__(self, attr): + if attr == "roof": + return self._roof + elif attr == "width": + return self.canvas().itemcget(self._polygon, attr) + elif attr == "color": + return self.canvas().itemcget(self._polygon, "outline") + elif isinstance(attr, tuple) and attr[0] == "color": + l = self._lines[int(attr[1])] + return self.canvas().itemcget(l, "fill") + elif attr == "xspace": + return self._xspace + elif attr == "yspace": + return self._yspace + elif attr == "orientation": + if self._horizontal: + return "horizontal" + else: + return "vertical" + elif attr == "ordered": + return self._ordered + else: + return CanvasWidget.__getitem__(self, attr) + + def label(self): + return self._label + + def subtrees(self): + return self._subtrees[:] + + def set_label(self, label): + """ + Set the node label to ``label``. + """ + self._remove_child_widget(self._label) + self._add_child_widget(label) + self._label = label + self.update(self._label) + + def replace_child(self, oldchild, newchild): + """ + Replace the child ``oldchild`` with ``newchild``. + """ + index = self._subtrees.index(oldchild) + self._subtrees[index] = newchild + self._remove_child_widget(oldchild) + self._add_child_widget(newchild) + self.update(newchild) + + def remove_child(self, child): + index = self._subtrees.index(child) + del self._subtrees[index] + self._remove_child_widget(child) + self.canvas().delete(self._lines.pop()) + self.update(self._label) + + def insert_child(self, index, child): + canvas = self.canvas() + self._subtrees.insert(index, child) + self._add_child_widget(child) + self._lines.append(canvas.create_line(0, 0, 0, 0, fill="#006060")) + self.update(self._label) + + # but.. lines??? + + def _tags(self): + if self._roof: + return [self._polygon] + else: + return self._lines + + def _subtree_top(self, child): + if isinstance(child, TreeSegmentWidget): + bbox = child.label().bbox() + else: + bbox = child.bbox() + if self._horizontal: + return (bbox[0], (bbox[1] + bbox[3]) / 2.0) + else: + return ((bbox[0] + bbox[2]) / 2.0, bbox[1]) + + def _node_bottom(self): + bbox = self._label.bbox() + if self._horizontal: + return (bbox[2], (bbox[1] + bbox[3]) / 2.0) + else: + return ((bbox[0] + bbox[2]) / 2.0, bbox[3]) + + def _update(self, child): + if len(self._subtrees) == 0: + return + if self._label.bbox() is None: + return # [XX] ??? + + # Which lines need to be redrawn? + if child is self._label: + need_update = self._subtrees + else: + need_update = [child] + + if self._ordered and not self._managing: + need_update = self._maintain_order(child) + + # Update the polygon. 
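+        # (The roof polygon runs from the node's attachment point to
+        # the near corners of the subtrees' combined bounding box.)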
+        (nodex, nodey) = self._node_bottom()
+        (xmin, ymin, xmax, ymax) = self._subtrees[0].bbox()
+        for subtree in self._subtrees[1:]:
+            bbox = subtree.bbox()
+            xmin = min(xmin, bbox[0])
+            ymin = min(ymin, bbox[1])
+            xmax = max(xmax, bbox[2])
+            ymax = max(ymax, bbox[3])
+
+        if self._horizontal:
+            self.canvas().coords(
+                self._polygon, nodex, nodey, xmin, ymin, xmin, ymax, nodex, nodey
+            )
+        else:
+            self.canvas().coords(
+                self._polygon, nodex, nodey, xmin, ymin, xmax, ymin, nodex, nodey
+            )
+
+        # Redraw all lines that need it.
+        for subtree in need_update:
+            (nodex, nodey) = self._node_bottom()
+            line = self._lines[self._subtrees.index(subtree)]
+            (subtreex, subtreey) = self._subtree_top(subtree)
+            self.canvas().coords(line, nodex, nodey, subtreex, subtreey)
+
+    def _maintain_order(self, child):
+        if self._horizontal:
+            return self._maintain_order_horizontal(child)
+        else:
+            return self._maintain_order_vertical(child)
+
+    def _maintain_order_vertical(self, child):
+        (left, top, right, bot) = child.bbox()
+
+        if child is self._label:
+            # Check all the leaves
+            for subtree in self._subtrees:
+                (x1, y1, x2, y2) = subtree.bbox()
+                if bot + self._yspace > y1:
+                    subtree.move(0, bot + self._yspace - y1)
+
+            return self._subtrees
+        else:
+            moved = [child]
+            index = self._subtrees.index(child)
+
+            # Check leaves to our right.
+            x = right + self._xspace
+            for i in range(index + 1, len(self._subtrees)):
+                (x1, y1, x2, y2) = self._subtrees[i].bbox()
+                if x > x1:
+                    self._subtrees[i].move(x - x1, 0)
+                    x += x2 - x1 + self._xspace
+                    moved.append(self._subtrees[i])
+
+            # Check leaves to our left.
+            x = left - self._xspace
+            for i in range(index - 1, -1, -1):
+                (x1, y1, x2, y2) = self._subtrees[i].bbox()
+                if x < x2:
+                    self._subtrees[i].move(x - x2, 0)
+                    x -= x2 - x1 + self._xspace
+                    moved.append(self._subtrees[i])
+
+            # Check the node
+            (x1, y1, x2, y2) = self._label.bbox()
+            if y2 > top - self._yspace:
+                self._label.move(0, top - self._yspace - y2)
+                moved = self._subtrees
+
+            # Return a list of the nodes we moved
+            return moved
+
+    def _maintain_order_horizontal(self, child):
+        (left, top, right, bot) = child.bbox()
+
+        if child is self._label:
+            # Check all the leaves
+            for subtree in self._subtrees:
+                (x1, y1, x2, y2) = subtree.bbox()
+                if right + self._xspace > x1:
+                    subtree.move(right + self._xspace - x1, 0)
+
+            return self._subtrees
+        else:
+            moved = [child]
+            index = self._subtrees.index(child)
+
+            # Check leaves below us.
+            y = bot + self._yspace
+            for i in range(index + 1, len(self._subtrees)):
+                (x1, y1, x2, y2) = self._subtrees[i].bbox()
+                if y > y1:
+                    self._subtrees[i].move(0, y - y1)
+                    y += y2 - y1 + self._yspace
+                    moved.append(self._subtrees[i])
+
+            # Check leaves above us
+            y = top - self._yspace
+            for i in range(index - 1, -1, -1):
+                (x1, y1, x2, y2) = self._subtrees[i].bbox()
+                if y < y2:
+                    self._subtrees[i].move(0, y - y2)
+                    y -= y2 - y1 + self._yspace
+                    moved.append(self._subtrees[i])
+
+            # Check the node
+            (x1, y1, x2, y2) = self._label.bbox()
+            if x2 > left - self._xspace:
+                self._label.move(left - self._xspace - x2, 0)
+                moved = self._subtrees
+
+            # Return a list of the nodes we moved
+            return moved
+
+    def _manage_horizontal(self):
+        (nodex, nodey) = self._node_bottom()
+
+        # Put the subtrees in a line.
+        y = 20
+        for subtree in self._subtrees:
+            subtree_bbox = subtree.bbox()
+            dx = nodex - subtree_bbox[0] + self._xspace
+            dy = y - subtree_bbox[1]
+            subtree.move(dx, dy)
+            y += subtree_bbox[3] - subtree_bbox[1] + self._yspace
+
+        # Find the center of their tops.
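+        # (i.e. average the y-coordinate of each subtree's attachment
+        # point, so the node can be centered against the group.)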
+        center = 0.0
+        for subtree in self._subtrees:
+            center += self._subtree_top(subtree)[1]
+        center /= len(self._subtrees)
+
+        # Center the subtrees with the node.
+        for subtree in self._subtrees:
+            subtree.move(0, nodey - center)
+
+    def _manage_vertical(self):
+        (nodex, nodey) = self._node_bottom()
+
+        # Put the subtrees in a line.
+        x = 0
+        for subtree in self._subtrees:
+            subtree_bbox = subtree.bbox()
+            dy = nodey - subtree_bbox[1] + self._yspace
+            dx = x - subtree_bbox[0]
+            subtree.move(dx, dy)
+            x += subtree_bbox[2] - subtree_bbox[0] + self._xspace
+
+        # Find the center of their tops.
+        center = 0.0
+        for subtree in self._subtrees:
+            center += self._subtree_top(subtree)[0] / len(self._subtrees)
+
+        # Center the subtrees with the node.
+        for subtree in self._subtrees:
+            subtree.move(nodex - center, 0)
+
+    def _manage(self):
+        self._managing = True
+        (nodex, nodey) = self._node_bottom()
+        if len(self._subtrees) == 0:
+            return
+
+        if self._horizontal:
+            self._manage_horizontal()
+        else:
+            self._manage_vertical()
+
+        # Update lines to subtrees.
+        for subtree in self._subtrees:
+            self._update(subtree)
+
+        self._managing = False
+
+    def __repr__(self):
+        return f"[TreeSeg {self._label}: {self._subtrees}]"
+
+
+def _tree_to_treeseg(
+    canvas,
+    t,
+    make_node,
+    make_leaf,
+    tree_attribs,
+    node_attribs,
+    leaf_attribs,
+    loc_attribs,
+):
+    if isinstance(t, Tree):
+        label = make_node(canvas, t.label(), **node_attribs)
+        subtrees = [
+            _tree_to_treeseg(
+                canvas,
+                child,
+                make_node,
+                make_leaf,
+                tree_attribs,
+                node_attribs,
+                leaf_attribs,
+                loc_attribs,
+            )
+            for child in t
+        ]
+        return TreeSegmentWidget(canvas, label, subtrees, **tree_attribs)
+    else:
+        return make_leaf(canvas, t, **leaf_attribs)
+
+
+def tree_to_treesegment(
+    canvas, t, make_node=TextWidget, make_leaf=TextWidget, **attribs
+):
+    """
+    Convert a Tree into a ``TreeSegmentWidget``.
+
+    :param make_node: A ``CanvasWidget`` constructor or a function that
+        creates ``CanvasWidgets``.  ``make_node`` is used to convert
+        the Tree's nodes into ``CanvasWidgets``.  If no constructor
+        is specified, then ``TextWidget`` will be used.
+    :param make_leaf: A ``CanvasWidget`` constructor or a function that
+        creates ``CanvasWidgets``.  ``make_leaf`` is used to convert
+        the Tree's leaves into ``CanvasWidgets``.  If no constructor
+        is specified, then ``TextWidget`` will be used.
+    :param attribs: Attributes for the canvas widgets that make up the
+        returned ``TreeSegmentWidget``.  Any attribute beginning with
+        ``'tree_'`` will be passed to all ``TreeSegmentWidgets`` (with
+        the ``'tree_'`` prefix removed).  Any attribute beginning with
+        ``'node_'`` will be passed to all nodes.  Any attribute
+        beginning with ``'leaf_'`` will be passed to all leaves.  And
+        any attribute beginning with ``'loc_'`` will be passed to all
+        text locations (for Trees).
+    """
+    # Process attribs.
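+    # (e.g. an illustrative keyword tree_color="green4" is split into
+    # tree_attribs["color"] = "green4", and is later passed to every
+    # TreeSegmentWidget that gets built.)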
+    tree_attribs = {}
+    node_attribs = {}
+    leaf_attribs = {}
+    loc_attribs = {}
+
+    for (key, value) in list(attribs.items()):
+        if key[:5] == "tree_":
+            tree_attribs[key[5:]] = value
+        elif key[:5] == "node_":
+            node_attribs[key[5:]] = value
+        elif key[:5] == "leaf_":
+            leaf_attribs[key[5:]] = value
+        elif key[:4] == "loc_":
+            loc_attribs[key[4:]] = value
+        else:
+            raise ValueError("Bad attribute: %s" % key)
+    return _tree_to_treeseg(
+        canvas,
+        t,
+        make_node,
+        make_leaf,
+        tree_attribs,
+        node_attribs,
+        leaf_attribs,
+        loc_attribs,
+    )
+
+
+##//////////////////////////////////////////////////////
+##  Tree Widget
+##//////////////////////////////////////////////////////
+
+
+class TreeWidget(CanvasWidget):
+    """
+    A canvas widget that displays a single Tree.
+    ``TreeWidget`` manages a group of ``TreeSegmentWidgets`` that are
+    used to display a Tree.
+
+    Attributes:
+
+    - ``node_attr``: Sets the attribute ``attr`` on all of the
+      node widgets for this ``TreeWidget``.
+    - ``leaf_attr``: Sets the attribute ``attr`` on all of the
+      leaf widgets for this ``TreeWidget``.
+    - ``loc_attr``: Sets the attribute ``attr`` on all of the
+      location widgets for this ``TreeWidget`` (if it was built from
+      a Tree).  Note that a location widget is a ``TextWidget``.
+
+    - ``xspace``: The amount of horizontal space to leave between
+      subtrees when managing this widget.  Default value is 10.
+    - ``yspace``: The amount of space to place between the node and
+      its children when managing this widget.  Default value is 15.
+
+    - ``line_color``: The color of the lines connecting each expanded
+      node to its subtrees.
+    - ``roof_color``: The color of the outline of the triangular roof
+      for collapsed trees.
+    - ``roof_fill``: The fill color for the triangular roof for
+      collapsed trees.
+    - ``width``
+
+    - ``orientation``: Determines whether the tree branches downwards
+      or rightwards.  Possible values are ``'horizontal'`` and
+      ``'vertical'``.  The default value is ``'vertical'`` (i.e.,
+      branch downwards).
+
+    - ``shapeable``: whether the subtrees can be independently
+      dragged by the user.  THIS property simply sets the
+      ``DRAGGABLE`` property on all of the ``TreeWidget``'s tree
+      segments.
+    - ``draggable``: whether the widget can be dragged by the user.
+    """
+
+    def __init__(
+        self, canvas, t, make_node=TextWidget, make_leaf=TextWidget, **attribs
+    ):
+        # Node & leaf canvas widget constructors
+        self._make_node = make_node
+        self._make_leaf = make_leaf
+        self._tree = t
+
+        # Attributes.
+        self._nodeattribs = {}
+        self._leafattribs = {}
+        self._locattribs = {"color": "#008000"}
+        self._line_color = "#008080"
+        self._line_width = 1
+        self._roof_color = "#008080"
+        self._roof_fill = "#c0c0c0"
+        self._shapeable = False
+        self._xspace = 10
+        self._yspace = 10
+        self._orientation = "vertical"
+        self._ordered = False
+
+        # Build trees.
+        self._keys = {}  # treeseg -> key
+        self._expanded_trees = {}
+        self._collapsed_trees = {}
+        self._nodes = []
+        self._leaves = []
+        # self._locs = []
+        self._make_collapsed_trees(canvas, t, ())
+        self._treeseg = self._make_expanded_tree(canvas, t, ())
+        self._add_child_widget(self._treeseg)
+
+        CanvasWidget.__init__(self, canvas, **attribs)
+
+    def expanded_tree(self, *path_to_tree):
+        """
+        Return the ``TreeSegmentWidget`` for the specified subtree.
+
+        :param path_to_tree: A list of indices i1, i2, ..., in, where
+            the desired widget is the widget corresponding to
+            ``tree.children()[i1].children()[i2]....children()[in]``.
+            For the root, the path is ``()``.
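+
+        For example (illustrative; ``widget`` is assumed to be a
+        ``TreeWidget`` whose tree's first child has two children):
+
+        >>> widget.expanded_tree(0, 1)  # widget for tree[0][1]  # doctest: +SKIP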
+        """
+        return self._expanded_trees[path_to_tree]
+
+    def collapsed_tree(self, *path_to_tree):
+        """
+        Return the ``TreeSegmentWidget`` for the specified subtree.
+
+        :param path_to_tree: A list of indices i1, i2, ..., in, where
+            the desired widget is the widget corresponding to
+            ``tree.children()[i1].children()[i2]....children()[in]``.
+            For the root, the path is ``()``.
+        """
+        return self._collapsed_trees[path_to_tree]
+
+    def bind_click_trees(self, callback, button=1):
+        """
+        Add a binding to all tree segments.
+        """
+        for tseg in list(self._expanded_trees.values()):
+            tseg.bind_click(callback, button)
+        for tseg in list(self._collapsed_trees.values()):
+            tseg.bind_click(callback, button)
+
+    def bind_drag_trees(self, callback, button=1):
+        """
+        Add a binding to all tree segments.
+        """
+        for tseg in list(self._expanded_trees.values()):
+            tseg.bind_drag(callback, button)
+        for tseg in list(self._collapsed_trees.values()):
+            tseg.bind_drag(callback, button)
+
+    def bind_click_leaves(self, callback, button=1):
+        """
+        Add a binding to all leaves.
+        """
+        for leaf in self._leaves:
+            leaf.bind_click(callback, button)
+
+    def bind_drag_leaves(self, callback, button=1):
+        """
+        Add a binding to all leaves.
+        """
+        for leaf in self._leaves:
+            leaf.bind_drag(callback, button)
+
+    def bind_click_nodes(self, callback, button=1):
+        """
+        Add a binding to all nodes.
+        """
+        for node in self._nodes:
+            node.bind_click(callback, button)
+
+    def bind_drag_nodes(self, callback, button=1):
+        """
+        Add a binding to all nodes.
+        """
+        for node in self._nodes:
+            node.bind_drag(callback, button)
+
+    def _make_collapsed_trees(self, canvas, t, key):
+        if not isinstance(t, Tree):
+            return
+        make_node = self._make_node
+        make_leaf = self._make_leaf
+
+        node = make_node(canvas, t.label(), **self._nodeattribs)
+        self._nodes.append(node)
+        leaves = [make_leaf(canvas, l, **self._leafattribs) for l in t.leaves()]
+        self._leaves += leaves
+        treeseg = TreeSegmentWidget(
+            canvas,
+            node,
+            leaves,
+            roof=1,
+            color=self._roof_color,
+            fill=self._roof_fill,
+            width=self._line_width,
+        )
+
+        self._collapsed_trees[key] = treeseg
+        self._keys[treeseg] = key
+        # self._add_child_widget(treeseg)
+        treeseg.hide()
+
+        # Build trees for children.
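+        # (Recurse so every subtree gets its own collapsed
+        # representation, keyed by its path from the root.)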
+ for i in range(len(t)): + child = t[i] + self._make_collapsed_trees(canvas, child, key + (i,)) + + def _make_expanded_tree(self, canvas, t, key): + make_node = self._make_node + make_leaf = self._make_leaf + + if isinstance(t, Tree): + node = make_node(canvas, t.label(), **self._nodeattribs) + self._nodes.append(node) + children = t + subtrees = [ + self._make_expanded_tree(canvas, children[i], key + (i,)) + for i in range(len(children)) + ] + treeseg = TreeSegmentWidget( + canvas, node, subtrees, color=self._line_color, width=self._line_width + ) + self._expanded_trees[key] = treeseg + self._keys[treeseg] = key + return treeseg + else: + leaf = make_leaf(canvas, t, **self._leafattribs) + self._leaves.append(leaf) + return leaf + + def __setitem__(self, attr, value): + if attr[:5] == "node_": + for node in self._nodes: + node[attr[5:]] = value + elif attr[:5] == "leaf_": + for leaf in self._leaves: + leaf[attr[5:]] = value + elif attr == "line_color": + self._line_color = value + for tseg in list(self._expanded_trees.values()): + tseg["color"] = value + elif attr == "line_width": + self._line_width = value + for tseg in list(self._expanded_trees.values()): + tseg["width"] = value + for tseg in list(self._collapsed_trees.values()): + tseg["width"] = value + elif attr == "roof_color": + self._roof_color = value + for tseg in list(self._collapsed_trees.values()): + tseg["color"] = value + elif attr == "roof_fill": + self._roof_fill = value + for tseg in list(self._collapsed_trees.values()): + tseg["fill"] = value + elif attr == "shapeable": + self._shapeable = value + for tseg in list(self._expanded_trees.values()): + tseg["draggable"] = value + for tseg in list(self._collapsed_trees.values()): + tseg["draggable"] = value + for leaf in self._leaves: + leaf["draggable"] = value + elif attr == "xspace": + self._xspace = value + for tseg in list(self._expanded_trees.values()): + tseg["xspace"] = value + for tseg in list(self._collapsed_trees.values()): + tseg["xspace"] = value + self.manage() + elif attr == "yspace": + self._yspace = value + for tseg in list(self._expanded_trees.values()): + tseg["yspace"] = value + for tseg in list(self._collapsed_trees.values()): + tseg["yspace"] = value + self.manage() + elif attr == "orientation": + self._orientation = value + for tseg in list(self._expanded_trees.values()): + tseg["orientation"] = value + for tseg in list(self._collapsed_trees.values()): + tseg["orientation"] = value + self.manage() + elif attr == "ordered": + self._ordered = value + for tseg in list(self._expanded_trees.values()): + tseg["ordered"] = value + for tseg in list(self._collapsed_trees.values()): + tseg["ordered"] = value + else: + CanvasWidget.__setitem__(self, attr, value) + + def __getitem__(self, attr): + if attr[:5] == "node_": + return self._nodeattribs.get(attr[5:], None) + elif attr[:5] == "leaf_": + return self._leafattribs.get(attr[5:], None) + elif attr[:4] == "loc_": + return self._locattribs.get(attr[4:], None) + elif attr == "line_color": + return self._line_color + elif attr == "line_width": + return self._line_width + elif attr == "roof_color": + return self._roof_color + elif attr == "roof_fill": + return self._roof_fill + elif attr == "shapeable": + return self._shapeable + elif attr == "xspace": + return self._xspace + elif attr == "yspace": + return self._yspace + elif attr == "orientation": + return self._orientation + else: + return CanvasWidget.__getitem__(self, attr) + + def _tags(self): + return [] + + def _manage(self): + segs = 
list(self._expanded_trees.values()) + list(
+            self._collapsed_trees.values()
+        )
+        for tseg in segs:
+            if tseg.hidden():
+                tseg.show()
+                tseg.manage()
+                tseg.hide()
+
+    def toggle_collapsed(self, treeseg):
+        """
+        Collapse/expand a tree.
+        """
+        old_treeseg = treeseg
+        if old_treeseg["roof"]:
+            new_treeseg = self._expanded_trees[self._keys[old_treeseg]]
+        else:
+            new_treeseg = self._collapsed_trees[self._keys[old_treeseg]]
+
+        # Replace the old tree with the new tree.
+        if old_treeseg.parent() is self:
+            self._remove_child_widget(old_treeseg)
+            self._add_child_widget(new_treeseg)
+            self._treeseg = new_treeseg
+        else:
+            old_treeseg.parent().replace_child(old_treeseg, new_treeseg)
+
+        # Move the new tree to where the old tree was.  Show it first,
+        # so we can find its bounding box.
+        new_treeseg.show()
+        (newx, newy) = new_treeseg.label().bbox()[:2]
+        (oldx, oldy) = old_treeseg.label().bbox()[:2]
+        new_treeseg.move(oldx - newx, oldy - newy)
+
+        # Hide the old tree
+        old_treeseg.hide()
+
+        # We could do parent.manage() here instead, if we wanted.
+        new_treeseg.parent().update(new_treeseg)
+
+
+##//////////////////////////////////////////////////////
+##  draw_trees
+##//////////////////////////////////////////////////////
+
+
+class TreeView:
+    def __init__(self, *trees):
+        from math import ceil, sqrt
+
+        self._trees = trees
+
+        self._top = Tk()
+        self._top.title("NLTK")
+        self._top.bind("<Control-x>", self.destroy)
+        self._top.bind("<Control-q>", self.destroy)
+
+        cf = self._cframe = CanvasFrame(self._top)
+        self._top.bind("<Control-p>", self._cframe.print_to_file)
+
+        # Size is variable.
+        self._size = IntVar(self._top)
+        self._size.set(12)
+        bold = ("helvetica", -self._size.get(), "bold")
+        helv = ("helvetica", -self._size.get())
+
+        # Lay the trees out in a square.
+        self._width = int(ceil(sqrt(len(trees))))
+        self._widgets = []
+        for i in range(len(trees)):
+            widget = TreeWidget(
+                cf.canvas(),
+                trees[i],
+                node_font=bold,
+                leaf_color="#008040",
+                node_color="#004080",
+                roof_color="#004040",
+                roof_fill="white",
+                line_color="#004040",
+                draggable=1,
+                leaf_font=helv,
+            )
+            widget.bind_click_trees(widget.toggle_collapsed)
+            self._widgets.append(widget)
+            cf.add_widget(widget, 0, 0)
+
+        self._layout()
+        self._cframe.pack(expand=1, fill="both")
+        self._init_menubar()
+
+    def _layout(self):
+        i = x = y = ymax = 0
+        width = self._width
+        for i in range(len(self._widgets)):
+            widget = self._widgets[i]
+            (oldx, oldy) = widget.bbox()[:2]
+            if i % width == 0:
+                y = ymax
+                x = 0
+            widget.move(x - oldx, y - oldy)
+            x = widget.bbox()[2] + 10
+            ymax = max(ymax, widget.bbox()[3] + 10)
+
+    def _init_menubar(self):
+        menubar = Menu(self._top)
+
+        filemenu = Menu(menubar, tearoff=0)
+        filemenu.add_command(
+            label="Print to Postscript",
+            underline=0,
+            command=self._cframe.print_to_file,
+            accelerator="Ctrl-p",
+        )
+        filemenu.add_command(
+            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
+        )
+        menubar.add_cascade(label="File", underline=0, menu=filemenu)
+
+        zoommenu = Menu(menubar, tearoff=0)
+        zoommenu.add_radiobutton(
+            label="Tiny",
+            variable=self._size,
+            underline=0,
+            value=10,
+            command=self.resize,
+        )
+        zoommenu.add_radiobutton(
+            label="Small",
+            variable=self._size,
+            underline=0,
+            value=12,
+            command=self.resize,
+        )
+        zoommenu.add_radiobutton(
+            label="Medium",
+            variable=self._size,
+            underline=0,
+            value=14,
+            command=self.resize,
+        )
+        zoommenu.add_radiobutton(
+            label="Large",
+            variable=self._size,
+            underline=0,
+            value=28,
+            command=self.resize,
+        )
+        zoommenu.add_radiobutton(
+            label="Huge",
+            variable=self._size,
+            underline=0,
+            value=50,
+            command=self.resize,
+        )
+        menubar.add_cascade(label="Zoom", underline=0, menu=zoommenu)
+
+        self._top.config(menu=menubar)
+
+    def resize(self, *e):
+        bold = ("helvetica", -self._size.get(), "bold")
+        helv = ("helvetica", -self._size.get())
+        xspace = self._size.get()
+        yspace = self._size.get()
+        for widget in self._widgets:
+            widget["node_font"] = bold
+            widget["leaf_font"] = helv
+            widget["xspace"] = xspace
+            widget["yspace"] = yspace
+            if self._size.get() < 20:
+                widget["line_width"] = 1
+            elif self._size.get() < 30:
+                widget["line_width"] = 2
+            else:
+                widget["line_width"] = 3
+        self._layout()
+
+    def destroy(self, *e):
+        if self._top is None:
+            return
+        self._top.destroy()
+        self._top = None
+
+    def mainloop(self, *args, **kwargs):
+        """
+        Enter the Tkinter mainloop.  This function must be called if
+        this demo is created from a non-interactive program (e.g.
+        from a script); otherwise, the demo will close as soon as
+        the script completes.
+        """
+        if in_idle():
+            return
+        self._top.mainloop(*args, **kwargs)
+
+
+def draw_trees(*trees):
+    """
+    Open a new window containing a graphical diagram of the given
+    trees.
+
+    :rtype: None
+    """
+    TreeView(*trees).mainloop()
+    return
+
+
+##//////////////////////////////////////////////////////
+##  Demo Code
+##//////////////////////////////////////////////////////
+
+
+def demo():
+    import random
+
+    def fill(cw):
+        cw["fill"] = "#%06d" % random.randint(0, 999999)
+
+    cf = CanvasFrame(width=550, height=450, closeenough=2)
+
+    t = Tree.fromstring(
+        """
+    (S (NP the very big cat)
+       (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))"""
+    )
+
+    tc = TreeWidget(
+        cf.canvas(),
+        t,
+        draggable=1,
+        node_font=("helvetica", -14, "bold"),
+        leaf_font=("helvetica", -12, "italic"),
+        roof_fill="white",
+        roof_color="black",
+        leaf_color="green4",
+        node_color="blue2",
+    )
+    cf.add_widget(tc, 10, 10)
+
+    def boxit(canvas, text):
+        big = ("helvetica", -16, "bold")
+        return BoxWidget(canvas, TextWidget(canvas, text, font=big), fill="green")
+
+    def ovalit(canvas, text):
+        return OvalWidget(canvas, TextWidget(canvas, text), fill="cyan")
+
+    treetok = Tree.fromstring("(S (NP this tree) (VP (V is) (AdjP shapeable)))")
+    tc2 = TreeWidget(cf.canvas(), treetok, boxit, ovalit, shapeable=1)
+
+    def color(node):
+        node["color"] = "#%04d00" % random.randint(0, 9999)
+
+    def color2(treeseg):
+        treeseg.label()["fill"] = "#%06d" % random.randint(0, 9999)
+        treeseg.label().child()["color"] = "white"
+
+    tc.bind_click_trees(tc.toggle_collapsed)
+    tc2.bind_click_trees(tc2.toggle_collapsed)
+    tc.bind_click_nodes(color, 3)
+    tc2.expanded_tree(1).bind_click(color2, 3)
+    tc2.expanded_tree().bind_click(color2, 3)
+
+    paren = ParenWidget(cf.canvas(), tc2)
+    cf.add_widget(paren, tc.bbox()[2] + 10, 10)
+
+    tree3 = Tree.fromstring(
+        """
+    (S (NP this tree) (AUX was)
+       (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))"""
+    )
+    tc3 = tree_to_treesegment(
+        cf.canvas(), tree3, tree_color="green4", tree_xspace=2, tree_width=2
+    )
+    tc3["draggable"] = 1
+    cf.add_widget(tc3, 10, tc.bbox()[3] + 10)
+
+    def orientswitch(treewidget):
+        if treewidget["orientation"] == "horizontal":
+            treewidget.expanded_tree(1, 1).subtrees()[0].set_text("vertical")
+            treewidget.collapsed_tree(1, 1).subtrees()[0].set_text("vertical")
+            treewidget.collapsed_tree(1).subtrees()[1].set_text("vertical")
+            treewidget.collapsed_tree().subtrees()[3].set_text("vertical")
+            treewidget["orientation"] = "vertical"
+        else:
+            treewidget.expanded_tree(1, 1).subtrees()[0].set_text("horizontal")
+            treewidget.collapsed_tree(1, 1).subtrees()[0].set_text("horizontal")
+            treewidget.collapsed_tree(1).subtrees()[1].set_text("horizontal")
+            treewidget.collapsed_tree().subtrees()[3].set_text("horizontal")
+            treewidget["orientation"] = "horizontal"
+
+    text = """
+Try clicking, right clicking, and dragging
+different elements of each of the trees.
+The top-left tree is a TreeWidget built from
+a Tree.  The top-right is a TreeWidget built
+from a Tree, using non-default widget
+constructors for the nodes & leaves (BoxWidget
+and OvalWidget).  The bottom-left tree is
+built from tree_to_treesegment."""
+    twidget = TextWidget(cf.canvas(), text.strip())
+    textbox = BoxWidget(cf.canvas(), twidget, fill="white", draggable=1)
+    cf.add_widget(textbox, tc3.bbox()[2] + 10, tc2.bbox()[3] + 10)
+
+    tree4 = Tree.fromstring("(S (NP this tree) (VP (V is) (Adj horizontal)))")
+    tc4 = TreeWidget(
+        cf.canvas(),
+        tree4,
+        draggable=1,
+        line_color="brown2",
+        roof_color="brown2",
+        node_font=("helvetica", -12, "bold"),
+        node_color="brown4",
+        orientation="horizontal",
+    )
+    tc4.manage()
+    cf.add_widget(tc4, tc3.bbox()[2] + 10, textbox.bbox()[3] + 10)
+    tc4.bind_click(orientswitch)
+    tc4.bind_click_trees(tc4.toggle_collapsed, 3)
+
+    # Run mainloop
+    cf.mainloop()
+
+
+if __name__ == "__main__":
+    demo()
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/draw/util.py b/.eggs/nltk-3.8-py3.10.egg/nltk/draw/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..8193cc82b453a5221b5118a414171dc841921f50
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/draw/util.py
@@ -0,0 +1,2575 @@
+# Natural Language Toolkit: Drawing utilities
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Edward Loper <edloper@gmail.com>
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Tools for graphically displaying and interacting with the objects and
+processing classes defined by the Toolkit.  These tools are primarily
+intended to help students visualize the objects that they create.
+
+The graphical tools are typically built using "canvas widgets", each
+of which encapsulates the graphical elements and bindings used to
+display a complex object on a Tkinter ``Canvas``.  For example, NLTK
+defines canvas widgets for displaying trees and directed graphs, as
+well as a number of simpler widgets.  These canvas widgets make it
+easier to build new graphical tools and demos.  See the class
+documentation for ``CanvasWidget`` for more information.
+
+The ``nltk.draw`` module defines the abstract ``CanvasWidget`` base
+class, and a number of simple canvas widgets.  The remaining canvas
+widgets are defined by submodules, such as ``nltk.draw.tree``.
+
+The ``nltk.draw`` module also defines ``CanvasFrame``, which
+encapsulates a ``Canvas`` and its scrollbars.  It uses a
+``ScrollWatcherWidget`` to ensure that all canvas widgets contained on
+its canvas are within the scroll region.
+
+Acknowledgements: Many of the ideas behind the canvas widget system
+are derived from ``CLIG``, a Tk-based grapher for linguistic data
+structures.  For more information, see the CLIG
+homepage (http://www.ags.uni-sb.de/~konrad/clig.html).
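+
+As a brief illustration of the typical usage pattern (a minimal
+sketch; it requires an interactive Tk display, so it is skipped under
+doctest):
+
+    >>> from nltk.draw.util import CanvasFrame, TextWidget, BoxWidget
+    >>> cf = CanvasFrame(width=300, height=200)  # doctest: +SKIP
+    >>> text = TextWidget(cf.canvas(), 'Hello NLTK', color='#004080')  # doctest: +SKIP
+    >>> box = BoxWidget(cf.canvas(), text, margin=5, draggable=1)  # doctest: +SKIP
+    >>> cf.add_widget(box, 20, 20)  # doctest: +SKIP
+    >>> cf.mainloop()  # doctest: +SKIP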
+
+"""
+from abc import ABCMeta, abstractmethod
+from tkinter import (
+    RAISED,
+    Button,
+    Canvas,
+    Entry,
+    Frame,
+    Label,
+    Menu,
+    Menubutton,
+    Scrollbar,
+    StringVar,
+    Text,
+    Tk,
+    Toplevel,
+    Widget,
+)
+from tkinter.filedialog import asksaveasfilename
+
+from nltk.util import in_idle
+
+##//////////////////////////////////////////////////////
+##  CanvasWidget
+##//////////////////////////////////////////////////////
+
+
+class CanvasWidget(metaclass=ABCMeta):
+    """
+    A collection of graphical elements and bindings used to display a
+    complex object on a Tkinter ``Canvas``.  A canvas widget is
+    responsible for managing the ``Canvas`` tags and callback bindings
+    necessary to display and interact with the object.  Canvas widgets
+    are often organized into hierarchies, where parent canvas widgets
+    control aspects of their child widgets.
+
+    Each canvas widget is bound to a single ``Canvas``.  This ``Canvas``
+    is specified as the first argument to the ``CanvasWidget``'s
+    constructor.
+
+    Attributes.  Each canvas widget can support a variety of
+    "attributes", which control how the canvas widget is displayed.
+    Some typical example attributes are ``color``, ``font``, and
+    ``radius``.  Each attribute has a default value.  This default
+    value can be overridden in the constructor, using keyword
+    arguments of the form ``attribute=value``:
+
+        >>> from nltk.draw.util import TextWidget
+        >>> cn = TextWidget(Canvas(), 'test', color='red')  # doctest: +SKIP
+
+    Attribute values can also be changed after a canvas widget has
+    been constructed, using the ``__setitem__`` operator:
+
+        >>> cn['font'] = 'times'  # doctest: +SKIP
+
+    The current value of an attribute value can be queried using the
+    ``__getitem__`` operator:
+
+        >>> cn['color']  # doctest: +SKIP
+        'red'
+
+    For a list of the attributes supported by a type of canvas widget,
+    see its class documentation.
+
+    Interaction.  The attribute ``'draggable'`` controls whether the
+    user can drag a canvas widget around the canvas.  By default,
+    canvas widgets are not draggable.
+
+    ``CanvasWidget`` provides callback support for two types of user
+    interaction: clicking and dragging.  The method ``bind_click``
+    registers a callback function that is called whenever the canvas
+    widget is clicked.  The method ``bind_drag`` registers a callback
+    function that is called after the canvas widget is dragged.  If
+    the user clicks or drags a canvas widget with no registered
+    callback function, then the interaction event will propagate to
+    its parent.  For each canvas widget, only one callback function
+    may be registered for an interaction event.  Callback functions
+    can be deregistered with the ``unbind_click`` and ``unbind_drag``
+    methods.
+
+    Subclassing.  ``CanvasWidget`` is an abstract class.  Subclasses
+    are required to implement the following methods, as illustrated
+    by the sketch after this list:
+
+    - ``__init__``: Builds a new canvas widget.  It must perform the
+      following three tasks (in order):
+
+      - Create any new graphical elements.
+      - Call ``_add_child_widget`` on each child widget.
+      - Call the ``CanvasWidget`` constructor.
+    - ``_tags``: Returns a list of the canvas tags for all graphical
+      elements managed by this canvas widget, not including
+      graphical elements managed by its child widgets.
+    - ``_manage``: Arranges the child widgets of this canvas widget.
+      This is typically only called when the canvas widget is
+      created.
+    - ``_update``: Update this canvas widget in response to a
+      change in a single child.
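+
+    For example, a minimal subclass (an illustrative sketch, not part
+    of the toolkit) that manages a single canvas line might look like:
+
+        >>> class LineWidget(CanvasWidget):  # doctest: +SKIP
+        ...     def __init__(self, canvas, x1, y1, x2, y2, **attribs):
+        ...         # First, create the graphical elements...
+        ...         self._tag = canvas.create_line(x1, y1, x2, y2)
+        ...         # ...then call the CanvasWidget constructor.
+        ...         CanvasWidget.__init__(self, canvas, **attribs)
+        ...     def _tags(self):
+        ...         return [self._tag]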
+
+    For a ``CanvasWidget`` with no child widgets, the default
+    definitions for ``_manage`` and ``_update`` may be used.
+
+    If a subclass defines any attributes, then it should implement
+    ``__getitem__`` and ``__setitem__``.  If either of these methods is
+    called with an unknown attribute, then they should propagate the
+    request to ``CanvasWidget``.
+
+    Most subclasses implement a number of additional methods that
+    modify the ``CanvasWidget`` in some way.  These methods must call
+    ``parent.update(self)`` after making any changes to the canvas
+    widget's graphical elements.  The canvas widget must also call
+    ``parent.update(self)`` after changing any attribute value that
+    affects the shape or position of the canvas widget's graphical
+    elements.
+
+    :type __canvas: Tkinter.Canvas
+    :ivar __canvas: This ``CanvasWidget``'s canvas.
+
+    :type __parent: CanvasWidget or None
+    :ivar __parent: This ``CanvasWidget``'s hierarchical parent widget.
+    :type __children: list(CanvasWidget)
+    :ivar __children: This ``CanvasWidget``'s hierarchical child widgets.
+
+    :type __updating: bool
+    :ivar __updating: Is this canvas widget currently performing an
+        update?  If it is, then it will ignore any new update requests
+        from child widgets.
+
+    :type __draggable: bool
+    :ivar __draggable: Is this canvas widget draggable?
+    :type __press: event
+    :ivar __press: The ButtonPress event that we're currently handling.
+    :type __drag_x: int
+    :ivar __drag_x: Where it's been moved to (to find dx)
+    :type __drag_y: int
+    :ivar __drag_y: Where it's been moved to (to find dy)
+    :type __callbacks: dictionary
+    :ivar __callbacks: Registered callbacks.  Currently, four keys are
+        used: ``1``, ``2``, ``3``, and ``'drag'``.  The values are
+        callback functions.  Each callback function takes a single
+        argument, which is the ``CanvasWidget`` that triggered the
+        callback.
+    """
+
+    def __init__(self, canvas, parent=None, **attribs):
+        """
+        Create a new canvas widget.  This constructor should only be
+        called by subclass constructors; and it should be called only
+        "after" the subclass has constructed all graphical canvas
+        objects and registered all child widgets.
+
+        :param canvas: This canvas widget's canvas.
+        :type canvas: Tkinter.Canvas
+        :param parent: This canvas widget's hierarchical parent.
+        :type parent: CanvasWidget
+        :param attribs: The new canvas widget's attributes.
+        """
+        if self.__class__ == CanvasWidget:
+            raise TypeError("CanvasWidget is an abstract base class")
+
+        if not isinstance(canvas, Canvas):
+            raise TypeError("Expected a canvas!")
+
+        self.__canvas = canvas
+        self.__parent = parent
+
+        # If the subclass constructor called _add_child_widget, then
+        # self.__children will already exist.
+        if not hasattr(self, "_CanvasWidget__children"):
+            self.__children = []
+
+        # Is this widget hidden?
+        self.__hidden = 0
+
+        # Update control (prevents infinite loops)
+        self.__updating = 0
+
+        # Button-press and drag callback handling.
+        self.__press = None
+        self.__drag_x = self.__drag_y = 0
+        self.__callbacks = {}
+        self.__draggable = 0
+
+        # Set up attributes.
+        for (attr, value) in list(attribs.items()):
+            self[attr] = value
+
+        # Manage this canvas widget
+        self._manage()
+
+        # Register any new bindings
+        for tag in self._tags():
+            self.__canvas.tag_bind(tag, "<ButtonPress-1>", self.__press_cb)
+            self.__canvas.tag_bind(tag, "<ButtonPress-2>", self.__press_cb)
+            self.__canvas.tag_bind(tag, "<ButtonPress-3>", self.__press_cb)
+
+    ##//////////////////////////////////////////////////////
+    ##  Inherited methods.
+ ##////////////////////////////////////////////////////// + + def bbox(self): + """ + :return: A bounding box for this ``CanvasWidget``. The bounding + box is a tuple of four coordinates, *(xmin, ymin, xmax, ymax)*, + for a rectangle which encloses all of the canvas + widget's graphical elements. Bounding box coordinates are + specified with respect to the coordinate space of the ``Canvas``. + :rtype: tuple(int, int, int, int) + """ + if self.__hidden: + return (0, 0, 0, 0) + if len(self.tags()) == 0: + raise ValueError("No tags") + return self.__canvas.bbox(*self.tags()) + + def width(self): + """ + :return: The width of this canvas widget's bounding box, in + its ``Canvas``'s coordinate space. + :rtype: int + """ + if len(self.tags()) == 0: + raise ValueError("No tags") + bbox = self.__canvas.bbox(*self.tags()) + return bbox[2] - bbox[0] + + def height(self): + """ + :return: The height of this canvas widget's bounding box, in + its ``Canvas``'s coordinate space. + :rtype: int + """ + if len(self.tags()) == 0: + raise ValueError("No tags") + bbox = self.__canvas.bbox(*self.tags()) + return bbox[3] - bbox[1] + + def parent(self): + """ + :return: The hierarchical parent of this canvas widget. + ``self`` is considered a subpart of its parent for + purposes of user interaction. + :rtype: CanvasWidget or None + """ + return self.__parent + + def child_widgets(self): + """ + :return: A list of the hierarchical children of this canvas + widget. These children are considered part of ``self`` + for purposes of user interaction. + :rtype: list of CanvasWidget + """ + return self.__children + + def canvas(self): + """ + :return: The canvas that this canvas widget is bound to. + :rtype: Tkinter.Canvas + """ + return self.__canvas + + def move(self, dx, dy): + """ + Move this canvas widget by a given distance. In particular, + shift the canvas widget right by ``dx`` pixels, and down by + ``dy`` pixels. Both ``dx`` and ``dy`` may be negative, resulting + in leftward or upward movement. + + :type dx: int + :param dx: The number of pixels to move this canvas widget + rightwards. + :type dy: int + :param dy: The number of pixels to move this canvas widget + downwards. + :rtype: None + """ + if dx == dy == 0: + return + for tag in self.tags(): + self.__canvas.move(tag, dx, dy) + if self.__parent: + self.__parent.update(self) + + def moveto(self, x, y, anchor="NW"): + """ + Move this canvas widget to the given location. In particular, + shift the canvas widget such that the corner or side of the + bounding box specified by ``anchor`` is at location (``x``, + ``y``). + + :param x,y: The location that the canvas widget should be moved + to. + :param anchor: The corner or side of the canvas widget that + should be moved to the specified location. ``'N'`` + specifies the top center; ``'NE'`` specifies the top right + corner; etc. + """ + x1, y1, x2, y2 = self.bbox() + if anchor == "NW": + self.move(x - x1, y - y1) + if anchor == "N": + self.move(x - x1 / 2 - x2 / 2, y - y1) + if anchor == "NE": + self.move(x - x2, y - y1) + if anchor == "E": + self.move(x - x2, y - y1 / 2 - y2 / 2) + if anchor == "SE": + self.move(x - x2, y - y2) + if anchor == "S": + self.move(x - x1 / 2 - x2 / 2, y - y2) + if anchor == "SW": + self.move(x - x1, y - y2) + if anchor == "W": + self.move(x - x1, y - y1 / 2 - y2 / 2) + + def destroy(self): + """ + Remove this ``CanvasWidget`` from its ``Canvas``. After a + ``CanvasWidget`` has been destroyed, it should not be accessed. 
+
+        Note that you only need to destroy a top-level
+        ``CanvasWidget``; its child widgets will be destroyed
+        automatically.  If you destroy a non-top-level
+        ``CanvasWidget``, then the entire top-level widget will be
+        destroyed.
+
+        :raise ValueError: if this ``CanvasWidget`` has a parent.
+        :rtype: None
+        """
+        if self.__parent is not None:
+            self.__parent.destroy()
+            return
+
+        for tag in self.tags():
+            self.__canvas.tag_unbind(tag, "<ButtonPress-1>")
+            self.__canvas.tag_unbind(tag, "<ButtonPress-2>")
+            self.__canvas.tag_unbind(tag, "<ButtonPress-3>")
+        self.__canvas.delete(*self.tags())
+        self.__canvas = None
+
+    def update(self, child):
+        """
+        Update the graphical display of this canvas widget, and all of
+        its ancestors, in response to a change in one of this canvas
+        widget's children.
+
+        :param child: The child widget that changed.
+        :type child: CanvasWidget
+        """
+        if self.__hidden or child.__hidden:
+            return
+        # If we're already updating, then do nothing.  This prevents
+        # infinite loops when _update modifies its children.
+        if self.__updating:
+            return
+        self.__updating = 1
+
+        # Update this CanvasWidget.
+        self._update(child)
+
+        # Propagate update request to the parent.
+        if self.__parent:
+            self.__parent.update(self)
+
+        # We're done updating.
+        self.__updating = 0
+
+    def manage(self):
+        """
+        Arrange this canvas widget and all of its descendants.
+
+        :rtype: None
+        """
+        if self.__hidden:
+            return
+        for child in self.__children:
+            child.manage()
+        self._manage()
+
+    def tags(self):
+        """
+        :return: a list of the canvas tags for all graphical
+            elements managed by this canvas widget, including
+            graphical elements managed by its child widgets.
+        :rtype: list of int
+        """
+        if self.__canvas is None:
+            raise ValueError("Attempt to access a destroyed canvas widget")
+        tags = []
+        tags += self._tags()
+        for child in self.__children:
+            tags += child.tags()
+        return tags
+
+    def __setitem__(self, attr, value):
+        """
+        Set the value of the attribute ``attr`` to ``value``.  See the
+        class documentation for a list of attributes supported by this
+        canvas widget.
+
+        :rtype: None
+        """
+        if attr == "draggable":
+            self.__draggable = value
+        else:
+            raise ValueError("Unknown attribute %r" % attr)
+
+    def __getitem__(self, attr):
+        """
+        :return: the value of the attribute ``attr``.  See the class
+            documentation for a list of attributes supported by this
+            canvas widget.
+        :rtype: (any)
+        """
+        if attr == "draggable":
+            return self.__draggable
+        else:
+            raise ValueError("Unknown attribute %r" % attr)
+
+    def __repr__(self):
+        """
+        :return: a string representation of this canvas widget.
+        :rtype: str
+        """
+        return "<%s>" % self.__class__.__name__
+
+    def hide(self):
+        """
+        Temporarily hide this canvas widget.
+
+        :rtype: None
+        """
+        self.__hidden = 1
+        for tag in self.tags():
+            self.__canvas.itemconfig(tag, state="hidden")
+
+    def show(self):
+        """
+        Show a hidden canvas widget.
+
+        :rtype: None
+        """
+        self.__hidden = 0
+        for tag in self.tags():
+            self.__canvas.itemconfig(tag, state="normal")
+
+    def hidden(self):
+        """
+        :return: True if this canvas widget is hidden.
+        :rtype: bool
+        """
+        return self.__hidden
+
+    ##//////////////////////////////////////////////////////
+    ##  Callback interface
+    ##//////////////////////////////////////////////////////
+
+    def bind_click(self, callback, button=1):
+        """
+        Register a new callback that will be called whenever this
+        ``CanvasWidget`` is clicked on.
+
+        :type callback: function
+        :param callback: The callback function that will be called
+            whenever this ``CanvasWidget`` is clicked.  This function
+            will be called with this ``CanvasWidget`` as its argument.
+        :type button: int
+        :param button: Which button the user should use to click on
+            this ``CanvasWidget``.  Typically, this should be 1 (left
+            button), 3 (right button), or 2 (middle button).
+        """
+        self.__callbacks[button] = callback
+
+    def bind_drag(self, callback):
+        """
+        Register a new callback that will be called after this
+        ``CanvasWidget`` is dragged.  This implicitly makes this
+        ``CanvasWidget`` draggable.
+
+        :type callback: function
+        :param callback: The callback function that will be called
+            whenever this ``CanvasWidget`` is dragged.  This function
+            will be called with this ``CanvasWidget`` as its argument.
+        """
+        self.__draggable = 1
+        self.__callbacks["drag"] = callback
+
+    def unbind_click(self, button=1):
+        """
+        Remove a callback that was registered with ``bind_click``.
+
+        :type button: int
+        :param button: Which button the user should use to click on
+            this ``CanvasWidget``.  Typically, this should be 1 (left
+            button), 3 (right button), or 2 (middle button).
+        """
+        try:
+            del self.__callbacks[button]
+        except KeyError:
+            pass
+
+    def unbind_drag(self):
+        """
+        Remove a callback that was registered with ``bind_drag``.
+        """
+        try:
+            del self.__callbacks["drag"]
+        except KeyError:
+            pass
+
+    ##//////////////////////////////////////////////////////
+    ##  Callback internals
+    ##//////////////////////////////////////////////////////
+
+    def __press_cb(self, event):
+        """
+        Handle a button-press event:
+          - record the button press event in ``self.__press``
+          - register a button-release callback.
+          - if this CanvasWidget or any of its ancestors are
+            draggable, then register the appropriate motion callback.
+        """
+        # If we're already waiting for a button release, then ignore
+        # this new button press.
+        if (
+            self.__canvas.bind("<ButtonRelease-1>")
+            or self.__canvas.bind("<ButtonRelease-2>")
+            or self.__canvas.bind("<ButtonRelease-3>")
+        ):
+            return
+
+        # Unbind motion (just in case; this shouldn't be necessary)
+        self.__canvas.unbind("<Motion>")
+
+        # Record the button press event.
+        self.__press = event
+
+        # If any ancestor is draggable, set up a motion callback.
+        # (Only if they pressed button number 1)
+        if event.num == 1:
+            widget = self
+            while widget is not None:
+                if widget["draggable"]:
+                    widget.__start_drag(event)
+                    break
+                widget = widget.parent()
+
+        # Set up the button release callback.
+        self.__canvas.bind("<ButtonRelease-%d>" % event.num, self.__release_cb)
+
+    def __start_drag(self, event):
+        """
+        Begin dragging this object:
+          - register a motion callback
+          - record the drag coordinates
+        """
+        self.__canvas.bind("<Motion>", self.__motion_cb)
+        self.__drag_x = event.x
+        self.__drag_y = event.y
+
+    def __motion_cb(self, event):
+        """
+        Handle a motion event:
+          - move this object to the new location
+          - record the new drag coordinates
+        """
+        self.move(event.x - self.__drag_x, event.y - self.__drag_y)
+        self.__drag_x = event.x
+        self.__drag_y = event.y
+
+    def __release_cb(self, event):
+        """
+        Handle a release callback:
+          - unregister motion & button release callbacks.
+          - decide whether they clicked, dragged, or cancelled
+          - call the appropriate handler.
+        """
+        # Unbind the button release & motion callbacks.
+        self.__canvas.unbind("<ButtonRelease-%d>" % event.num)
+        self.__canvas.unbind("<Motion>")
+
+        # Is it a click or a drag?
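+        # (A press/release pair counts as a click if it lasted less
+        # than 100 milliseconds and moved fewer than 5 pixels in
+        # total; anything slower or farther counts as a drag.)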
+        if (
+            event.time - self.__press.time < 100
+            and abs(event.x - self.__press.x) + abs(event.y - self.__press.y) < 5
+        ):
+            # Move it back, if we were dragging.
+            if self.__draggable and event.num == 1:
+                self.move(
+                    self.__press.x - self.__drag_x, self.__press.y - self.__drag_y
+                )
+            self.__click(event.num)
+        elif event.num == 1:
+            self.__drag()
+
+        self.__press = None
+
+    def __drag(self):
+        """
+        If this ``CanvasWidget`` has a drag callback, then call it;
+        otherwise, find the closest ancestor with a drag callback, and
+        call it.  If no ancestors have a drag callback, do nothing.
+        """
+        if self.__draggable:
+            if "drag" in self.__callbacks:
+                cb = self.__callbacks["drag"]
+                try:
+                    cb(self)
+                except Exception:
+                    print("Error in drag callback for %r" % self)
+        elif self.__parent is not None:
+            self.__parent.__drag()
+
+    def __click(self, button):
+        """
+        If this ``CanvasWidget`` has a click callback, then call it;
+        otherwise, find the closest ancestor with a click callback, and
+        call it.  If no ancestors have a click callback, do nothing.
+        """
+        if button in self.__callbacks:
+            cb = self.__callbacks[button]
+            cb(self)
+        elif self.__parent is not None:
+            self.__parent.__click(button)
+
+    ##//////////////////////////////////////////////////////
+    ##  Child/parent Handling
+    ##//////////////////////////////////////////////////////
+
+    def _add_child_widget(self, child):
+        """
+        Register a hierarchical child widget.  The child will be
+        considered part of this canvas widget for purposes of user
+        interaction.  ``_add_child_widget`` has two direct effects:
+          - It sets ``child``'s parent to this canvas widget.
+          - It adds ``child`` to the list of canvas widgets returned by
+            the ``child_widgets`` member function.
+
+        :param child: The new child widget.  ``child`` must not already
+            have a parent.
+        :type child: CanvasWidget
+        """
+        if not hasattr(self, "_CanvasWidget__children"):
+            self.__children = []
+        if child.__parent is not None:
+            raise ValueError(f"{child} already has a parent")
+        child.__parent = self
+        self.__children.append(child)
+
+    def _remove_child_widget(self, child):
+        """
+        Remove a hierarchical child widget.  This child will no longer
+        be considered part of this canvas widget for purposes of user
+        interaction.  ``_remove_child_widget`` has two direct effects:
+          - It sets ``child``'s parent to None.
+          - It removes ``child`` from the list of canvas widgets
+            returned by the ``child_widgets`` member function.
+
+        :param child: The child widget to remove.  ``child`` must be a
+            child of this canvas widget.
+        :type child: CanvasWidget
+        """
+        self.__children.remove(child)
+        child.__parent = None
+
+    ##//////////////////////////////////////////////////////
+    ##  Defined by subclass
+    ##//////////////////////////////////////////////////////
+
+    @abstractmethod
+    def _tags(self):
+        """
+        :return: a list of canvas tags for all graphical elements
+            managed by this canvas widget, not including graphical
+            elements managed by its child widgets.
+        :rtype: list of int
+        """
+
+    def _manage(self):
+        """
+        Arrange the child widgets of this canvas widget.  This method
+        is called when the canvas widget is initially created.  It is
+        also called if the user calls the ``manage`` method on this
+        canvas widget or any of its ancestors.
+
+        :rtype: None
+        """
+
+    def _update(self, child):
+        """
+        Update this canvas widget in response to a change in one of
+        its children.
+
+        :param child: The child that changed.
+        :type child: CanvasWidget
+        :rtype: None
+        """
+
+
+##//////////////////////////////////////////////////////
+##  Basic widgets.
+##//////////////////////////////////////////////////////
+
+
+class TextWidget(CanvasWidget):
+    """
+    A canvas widget that displays a single string of text.
+
+    Attributes:
+      - ``color``: the color of the text.
+      - ``font``: the font used to display the text.
+      - ``justify``: justification for multi-line texts.  Valid values
+        are ``left``, ``center``, and ``right``.
+      - ``width``: the width of the text.  If the text is wider than
+        this width, it will be line-wrapped at whitespace.
+      - ``draggable``: whether the text can be dragged by the user.
+    """
+
+    def __init__(self, canvas, text, **attribs):
+        """
+        Create a new text widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :type text: str
+        :param text: The string of text to display.
+        :param attribs: The new canvas widget's attributes.
+        """
+        self._text = text
+        self._tag = canvas.create_text(1, 1, text=text)
+        CanvasWidget.__init__(self, canvas, **attribs)
+
+    def __setitem__(self, attr, value):
+        if attr in ("color", "font", "justify", "width"):
+            if attr == "color":
+                attr = "fill"
+            self.canvas().itemconfig(self._tag, {attr: value})
+        else:
+            CanvasWidget.__setitem__(self, attr, value)
+
+    def __getitem__(self, attr):
+        if attr == "width":
+            return int(self.canvas().itemcget(self._tag, attr))
+        elif attr in ("color", "font", "justify"):
+            if attr == "color":
+                attr = "fill"
+            return self.canvas().itemcget(self._tag, attr)
+        else:
+            return CanvasWidget.__getitem__(self, attr)
+
+    def _tags(self):
+        return [self._tag]
+
+    def text(self):
+        """
+        :return: The text displayed by this text widget.
+        :rtype: str
+        """
+        return self.canvas().itemcget(self._tag, "text")
+
+    def set_text(self, text):
+        """
+        Change the text that is displayed by this text widget.
+
+        :type text: str
+        :param text: The string of text to display.
+        :rtype: None
+        """
+        self.canvas().itemconfig(self._tag, text=text)
+        if self.parent() is not None:
+            self.parent().update(self)
+
+    def __repr__(self):
+        return "[Text: %r]" % self._text
+
+
+class SymbolWidget(TextWidget):
+    """
+    A canvas widget that displays special symbols, such as the
+    negation sign and the exists operator.  Symbols are specified by
+    name.  Currently, the following symbol names are defined: ``neg``,
+    ``disj``, ``conj``, ``lambda``, ``merge``, ``forall``, ``exists``,
+    ``subseteq``, ``subset``, ``notsubset``, ``emptyset``, ``imp``,
+    ``rightarrow``, ``equal``, ``notequal``, ``epsilon``.
+
+    Attributes:
+      - ``color``: the color of the text.
+      - ``draggable``: whether the text can be dragged by the user.
+
+    :cvar SYMBOLS: A dictionary mapping from symbols to the character
+        in the ``symbol`` font used to render them.
+    """
+
+    SYMBOLS = {
+        "neg": "\330",
+        "disj": "\332",
+        "conj": "\331",
+        "lambda": "\154",
+        "merge": "\304",
+        "forall": "\042",
+        "exists": "\044",
+        "subseteq": "\315",
+        "subset": "\314",
+        "notsubset": "\313",
+        "emptyset": "\306",
+        "imp": "\336",
+        "rightarrow": chr(222),  # '\256',
+        "equal": "\75",
+        "notequal": "\271",
+        "intersection": "\307",
+        "union": "\310",
+        "epsilon": "e",
+    }
+
+    def __init__(self, canvas, symbol, **attribs):
+        """
+        Create a new symbol widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :type symbol: str
+        :param symbol: The name of the symbol to display.
+        :param attribs: The new canvas widget's attributes.
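+
+        For example (an illustrative sketch; requires a Tk display):
+
+            >>> sym = SymbolWidget(Canvas(), 'forall', color='blue')  # doctest: +SKIP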
+        """
+        attribs["font"] = "symbol"
+        TextWidget.__init__(self, canvas, "", **attribs)
+        self.set_symbol(symbol)
+
+    def symbol(self):
+        """
+        :return: the name of the symbol that is displayed by this
+            symbol widget.
+        :rtype: str
+        """
+        return self._symbol
+
+    def set_symbol(self, symbol):
+        """
+        Change the symbol that is displayed by this symbol widget.
+
+        :type symbol: str
+        :param symbol: The name of the symbol to display.
+        """
+        if symbol not in SymbolWidget.SYMBOLS:
+            raise ValueError("Unknown symbol: %s" % symbol)
+        self._symbol = symbol
+        self.set_text(SymbolWidget.SYMBOLS[symbol])
+
+    def __repr__(self):
+        return "[Symbol: %r]" % self._symbol
+
+    @staticmethod
+    def symbolsheet(size=20):
+        """
+        Open a new Tkinter window that displays the entire alphabet
+        for the symbol font.  This is useful for constructing the
+        ``SymbolWidget.SYMBOLS`` dictionary.
+        """
+        top = Tk()
+
+        def destroy(e, top=top):
+            top.destroy()
+
+        top.bind("q", destroy)
+        Button(top, text="Quit", command=top.destroy).pack(side="bottom")
+        text = Text(top, font=("helvetica", -size), width=20, height=30)
+        text.pack(side="left")
+        sb = Scrollbar(top, command=text.yview)
+        text["yscrollcommand"] = sb.set
+        sb.pack(side="right", fill="y")
+        text.tag_config("symbol", font=("symbol", -size))
+        for i in range(256):
+            if i in (0, 10):
+                continue  # null and newline
+            for k, v in list(SymbolWidget.SYMBOLS.items()):
+                if v == chr(i):
+                    text.insert("end", "%-10s\t" % k)
+                    break
+            else:
+                text.insert("end", "%-10d \t" % i)
+            text.insert("end", "[%s]\n" % chr(i), "symbol")
+        top.mainloop()
+
+
+class AbstractContainerWidget(CanvasWidget):
+    """
+    An abstract class for canvas widgets that contain a single child,
+    such as ``BoxWidget`` and ``OvalWidget``.  Subclasses must define
+    a constructor, which should create any new graphical elements and
+    then call the ``AbstractContainerWidget`` constructor.  Subclasses
+    must also define the ``_update`` method and the ``_tags`` method;
+    and any subclasses that define attributes should define
+    ``__setitem__`` and ``__getitem__``.
+    """
+
+    def __init__(self, canvas, child, **attribs):
+        """
+        Create a new container widget.  This constructor should only
+        be called by subclass constructors.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :param child: The container's child widget.  ``child`` must not
+            have a parent.
+        :type child: CanvasWidget
+        :param attribs: The new canvas widget's attributes.
+        """
+        self._child = child
+        self._add_child_widget(child)
+        CanvasWidget.__init__(self, canvas, **attribs)
+
+    def _manage(self):
+        self._update(self._child)
+
+    def child(self):
+        """
+        :return: The child widget contained by this container widget.
+        :rtype: CanvasWidget
+        """
+        return self._child
+
+    def set_child(self, child):
+        """
+        Change the child widget contained by this container widget.
+
+        :param child: The new child widget.  ``child`` must not have a
+            parent.
+        :type child: CanvasWidget
+        :rtype: None
+        """
+        self._remove_child_widget(self._child)
+        self._add_child_widget(child)
+        self._child = child
+        self.update(child)
+
+    def __repr__(self):
+        name = self.__class__.__name__
+        if name[-6:] == "Widget":
+            name = name[:-6]
+        return f"[{name}: {self._child!r}]"
+
+
+class BoxWidget(AbstractContainerWidget):
+    """
+    A canvas widget that places a box around a child widget.
+
+    Attributes:
+      - ``fill``: The color used to fill the interior of the box.
+      - ``outline``: The color used to draw the outline of the box.
+      - ``width``: The width of the outline of the box.
+      - ``margin``: The number of pixels space left between the child
+        and the box.
+      - ``draggable``: whether the text can be dragged by the user.
+    """
+
+    def __init__(self, canvas, child, **attribs):
+        """
+        Create a new box widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :param child: The child widget.  ``child`` must not have a
+            parent.
+        :type child: CanvasWidget
+        :param attribs: The new canvas widget's attributes.
+        """
+        self._child = child
+        self._margin = 1
+        self._box = canvas.create_rectangle(1, 1, 1, 1)
+        canvas.tag_lower(self._box)
+        AbstractContainerWidget.__init__(self, canvas, child, **attribs)
+
+    def __setitem__(self, attr, value):
+        if attr == "margin":
+            self._margin = value
+        elif attr in ("outline", "fill", "width"):
+            self.canvas().itemconfig(self._box, {attr: value})
+        else:
+            CanvasWidget.__setitem__(self, attr, value)
+
+    def __getitem__(self, attr):
+        if attr == "margin":
+            return self._margin
+        elif attr == "width":
+            return float(self.canvas().itemcget(self._box, attr))
+        elif attr in ("outline", "fill"):
+            return self.canvas().itemcget(self._box, attr)
+        else:
+            return CanvasWidget.__getitem__(self, attr)
+
+    def _update(self, child):
+        (x1, y1, x2, y2) = child.bbox()
+        margin = self._margin + self["width"] / 2
+        self.canvas().coords(
+            self._box, x1 - margin, y1 - margin, x2 + margin, y2 + margin
+        )
+
+    def _tags(self):
+        return [self._box]
+
+
+class OvalWidget(AbstractContainerWidget):
+    """
+    A canvas widget that places an oval around a child widget.
+
+    Attributes:
+      - ``fill``: The color used to fill the interior of the oval.
+      - ``outline``: The color used to draw the outline of the oval.
+      - ``width``: The width of the outline of the oval.
+      - ``margin``: The number of pixels space left between the child
+        and the oval.
+      - ``draggable``: whether the text can be dragged by the user.
+      - ``double``: If true, then a double-oval is drawn.
+    """
+
+    def __init__(self, canvas, child, **attribs):
+        """
+        Create a new oval widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :param child: The child widget.  ``child`` must not have a
+            parent.
+        :type child: CanvasWidget
+        :param attribs: The new canvas widget's attributes.
+        """
+        self._child = child
+        self._margin = 1
+        self._oval = canvas.create_oval(1, 1, 1, 1)
+        self._circle = attribs.pop("circle", False)
+        self._double = attribs.pop("double", False)
+        if self._double:
+            self._oval2 = canvas.create_oval(1, 1, 1, 1)
+        else:
+            self._oval2 = None
+        canvas.tag_lower(self._oval)
+        AbstractContainerWidget.__init__(self, canvas, child, **attribs)
+
+    def __setitem__(self, attr, value):
+        c = self.canvas()
+        if attr == "margin":
+            self._margin = value
+        elif attr == "double":
+            if value and self._oval2 is None:
+                # Copy attributes & position from self._oval.
+                x1, y1, x2, y2 = c.bbox(self._oval)
+                w = self["width"] * 2
+                self._oval2 = c.create_oval(
+                    x1 - w,
+                    y1 - w,
+                    x2 + w,
+                    y2 + w,
+                    outline=c.itemcget(self._oval, "outline"),
+                    width=c.itemcget(self._oval, "width"),
+                )
+                c.tag_lower(self._oval2)
+            if not value and self._oval2 is not None:
+                c.delete(self._oval2)
+                self._oval2 = None
+        elif attr in ("outline", "fill", "width"):
+            c.itemconfig(self._oval, {attr: value})
+            if self._oval2 is not None and attr != "fill":
+                c.itemconfig(self._oval2, {attr: value})
+        else:
+            CanvasWidget.__setitem__(self, attr, value)
+
+    def __getitem__(self, attr):
+        if attr == "margin":
+            return self._margin
+        elif attr == "double":
+            return self._oval2 is not None
+        elif attr == "width":
+            return float(self.canvas().itemcget(self._oval, attr))
+        elif attr in ("outline", "fill"):
+            return self.canvas().itemcget(self._oval, attr)
+        else:
+            return CanvasWidget.__getitem__(self, attr)
+
+    # The ratio between inscribed & circumscribed ovals
+    RATIO = 1.4142135623730949
+
+    def _update(self, child):
+        R = OvalWidget.RATIO
+        (x1, y1, x2, y2) = child.bbox()
+        margin = self._margin
+
+        # If we're a circle, pretend our contents are square.
+        if self._circle:
+            dx, dy = abs(x1 - x2), abs(y1 - y2)
+            if dx > dy:
+                y = (y1 + y2) / 2
+                y1, y2 = y - dx / 2, y + dx / 2
+            elif dy > dx:
+                x = (x1 + x2) / 2
+                x1, x2 = x - dy / 2, x + dy / 2
+
+        # Find the four corners.
+        left = int((x1 * (1 + R) + x2 * (1 - R)) / 2)
+        right = left + int((x2 - x1) * R)
+        top = int((y1 * (1 + R) + y2 * (1 - R)) / 2)
+        bot = top + int((y2 - y1) * R)
+        self.canvas().coords(
+            self._oval, left - margin, top - margin, right + margin, bot + margin
+        )
+        if self._oval2 is not None:
+            self.canvas().coords(
+                self._oval2,
+                left - margin + 2,
+                top - margin + 2,
+                right + margin - 2,
+                bot + margin - 2,
+            )
+
+    def _tags(self):
+        if self._oval2 is None:
+            return [self._oval]
+        else:
+            return [self._oval, self._oval2]
+
+
+class ParenWidget(AbstractContainerWidget):
+    """
+    A canvas widget that places a pair of parentheses around a child
+    widget.
+
+    Attributes:
+      - ``color``: The color used to draw the parentheses.
+      - ``width``: The width of the parentheses.
+      - ``draggable``: whether the text can be dragged by the user.
+    """
+
+    def __init__(self, canvas, child, **attribs):
+        """
+        Create a new parenthesis widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :param child: The child widget.  ``child`` must not have a
+            parent.
+        :type child: CanvasWidget
+        :param attribs: The new canvas widget's attributes.
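+
+        For example (an illustrative sketch; requires a Tk display):
+
+            >>> c = Canvas()  # doctest: +SKIP
+            >>> paren = ParenWidget(c, TextWidget(c, 'a b c'))  # doctest: +SKIP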
+        """
+        self._child = child
+        self._oparen = canvas.create_arc(1, 1, 1, 1, style="arc", start=90, extent=180)
+        self._cparen = canvas.create_arc(1, 1, 1, 1, style="arc", start=-90, extent=180)
+        AbstractContainerWidget.__init__(self, canvas, child, **attribs)
+
+    def __setitem__(self, attr, value):
+        if attr == "color":
+            self.canvas().itemconfig(self._oparen, outline=value)
+            self.canvas().itemconfig(self._cparen, outline=value)
+        elif attr == "width":
+            self.canvas().itemconfig(self._oparen, width=value)
+            self.canvas().itemconfig(self._cparen, width=value)
+        else:
+            CanvasWidget.__setitem__(self, attr, value)
+
+    def __getitem__(self, attr):
+        if attr == "color":
+            return self.canvas().itemcget(self._oparen, "outline")
+        elif attr == "width":
+            return self.canvas().itemcget(self._oparen, "width")
+        else:
+            return CanvasWidget.__getitem__(self, attr)
+
+    def _update(self, child):
+        (x1, y1, x2, y2) = child.bbox()
+        width = max((y2 - y1) / 6, 4)
+        self.canvas().coords(self._oparen, x1 - width, y1, x1 + width, y2)
+        self.canvas().coords(self._cparen, x2 - width, y1, x2 + width, y2)
+
+    def _tags(self):
+        return [self._oparen, self._cparen]
+
+
+class BracketWidget(AbstractContainerWidget):
+    """
+    A canvas widget that places a pair of brackets around a child
+    widget.
+
+    Attributes:
+      - ``color``: The color used to draw the brackets.
+      - ``width``: The width of the brackets.
+      - ``draggable``: whether the text can be dragged by the user.
+    """
+
+    def __init__(self, canvas, child, **attribs):
+        """
+        Create a new bracket widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :param child: The child widget.  ``child`` must not have a
+            parent.
+        :type child: CanvasWidget
+        :param attribs: The new canvas widget's attributes.
+        """
+        self._child = child
+        self._obrack = canvas.create_line(1, 1, 1, 1, 1, 1, 1, 1)
+        self._cbrack = canvas.create_line(1, 1, 1, 1, 1, 1, 1, 1)
+        AbstractContainerWidget.__init__(self, canvas, child, **attribs)
+
+    def __setitem__(self, attr, value):
+        if attr == "color":
+            self.canvas().itemconfig(self._obrack, fill=value)
+            self.canvas().itemconfig(self._cbrack, fill=value)
+        elif attr == "width":
+            self.canvas().itemconfig(self._obrack, width=value)
+            self.canvas().itemconfig(self._cbrack, width=value)
+        else:
+            CanvasWidget.__setitem__(self, attr, value)
+
+    def __getitem__(self, attr):
+        if attr == "color":
+            # The brackets are line items, so their color is stored in
+            # the "fill" option rather than "outline".
+            return self.canvas().itemcget(self._obrack, "fill")
+        elif attr == "width":
+            return self.canvas().itemcget(self._obrack, "width")
+        else:
+            return CanvasWidget.__getitem__(self, attr)
+
+    def _update(self, child):
+        (x1, y1, x2, y2) = child.bbox()
+        width = max((y2 - y1) / 8, 2)
+        self.canvas().coords(
+            self._obrack, x1, y1, x1 - width, y1, x1 - width, y2, x1, y2
+        )
+        self.canvas().coords(
+            self._cbrack, x2, y1, x2 + width, y1, x2 + width, y2, x2, y2
+        )
+
+    def _tags(self):
+        return [self._obrack, self._cbrack]
+
+
+class SequenceWidget(CanvasWidget):
+    """
+    A canvas widget that keeps a list of canvas widgets in a
+    horizontal line.
+
+    Attributes:
+      - ``align``: The vertical alignment of the children.  Possible
+        values are ``'top'``, ``'center'``, and ``'bottom'``.  By
+        default, children are center-aligned.
+      - ``space``: The amount of horizontal space to place between
+        children.  By default, one pixel of space is used.
+      - ``ordered``: If true, then keep the children in their
+        original order.
+    """
+
+    def __init__(self, canvas, *children, **attribs):
+        """
+        Create a new sequence widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :param children: The widgets that should be aligned
+            horizontally.  Each child must not have a parent.
+        :type children: list(CanvasWidget)
+        :param attribs: The new canvas widget's attributes.
+        """
+        self._align = "center"
+        self._space = 1
+        self._ordered = False
+        self._children = list(children)
+        for child in children:
+            self._add_child_widget(child)
+        CanvasWidget.__init__(self, canvas, **attribs)
+
+    def __setitem__(self, attr, value):
+        if attr == "align":
+            if value not in ("top", "bottom", "center"):
+                raise ValueError("Bad alignment: %r" % value)
+            self._align = value
+        elif attr == "space":
+            self._space = value
+        elif attr == "ordered":
+            self._ordered = value
+        else:
+            CanvasWidget.__setitem__(self, attr, value)
+
+    def __getitem__(self, attr):
+        if attr == "align":
+            return self._align
+        elif attr == "space":
+            return self._space
+        elif attr == "ordered":
+            return self._ordered
+        else:
+            return CanvasWidget.__getitem__(self, attr)
+
+    def _tags(self):
+        return []
+
+    def _yalign(self, top, bot):
+        if self._align == "top":
+            return top
+        if self._align == "bottom":
+            return bot
+        if self._align == "center":
+            return (top + bot) / 2
+
+    def _update(self, child):
+        # Align all children with child.
+        (left, top, right, bot) = child.bbox()
+        y = self._yalign(top, bot)
+        for c in self._children:
+            (x1, y1, x2, y2) = c.bbox()
+            c.move(0, y - self._yalign(y1, y2))
+
+        if self._ordered and len(self._children) > 1:
+            index = self._children.index(child)
+
+            x = right + self._space
+            for i in range(index + 1, len(self._children)):
+                (x1, y1, x2, y2) = self._children[i].bbox()
+                if x > x1:
+                    self._children[i].move(x - x1, 0)
+                    x += x2 - x1 + self._space
+
+            x = left - self._space
+            for i in range(index - 1, -1, -1):
+                (x1, y1, x2, y2) = self._children[i].bbox()
+                if x < x2:
+                    self._children[i].move(x - x2, 0)
+                    x -= x2 - x1 + self._space
+
+    def _manage(self):
+        if len(self._children) == 0:
+            return
+        child = self._children[0]
+
+        # Align all children with child.
+        (left, top, right, bot) = child.bbox()
+        y = self._yalign(top, bot)
+
+        index = self._children.index(child)
+
+        # Line up children to the right of child.
+        x = right + self._space
+        for i in range(index + 1, len(self._children)):
+            (x1, y1, x2, y2) = self._children[i].bbox()
+            self._children[i].move(x - x1, y - self._yalign(y1, y2))
+            x += x2 - x1 + self._space
+
+        # Line up children to the left of child.
+        x = left - self._space
+        for i in range(index - 1, -1, -1):
+            (x1, y1, x2, y2) = self._children[i].bbox()
+            self._children[i].move(x - x2, y - self._yalign(y1, y2))
+            x -= x2 - x1 + self._space
+
+    def __repr__(self):
+        return "[Sequence: " + repr(self._children)[1:-1] + "]"
+
+    # Provide an alias for the child_widgets() member.
+    children = CanvasWidget.child_widgets
+
+    def replace_child(self, oldchild, newchild):
+        """
+        Replace the child canvas widget ``oldchild`` with ``newchild``.
+        ``newchild`` must not have a parent.  ``oldchild``'s parent will
+        be set to None.
+
+        :type oldchild: CanvasWidget
+        :param oldchild: The child canvas widget to remove.
+        :type newchild: CanvasWidget
+        :param newchild: The canvas widget that should replace
+            ``oldchild``.
+        """
+        index = self._children.index(oldchild)
+        self._children[index] = newchild
+        self._remove_child_widget(oldchild)
+        self._add_child_widget(newchild)
+        self.update(newchild)
+
+    def remove_child(self, child):
+        """
+        Remove the given child canvas widget.  ``child``'s parent will
+        be set to None.
+
+        :type child: CanvasWidget
+        :param child: The child canvas widget to remove.
+        """
+        index = self._children.index(child)
+        del self._children[index]
+        self._remove_child_widget(child)
+        if len(self._children) > 0:
+            self.update(self._children[0])
+
+    def insert_child(self, index, child):
+        """
+        Insert a child canvas widget before a given index.
+
+        :type child: CanvasWidget
+        :param child: The canvas widget that should be inserted.
+        :type index: int
+        :param index: The index where the child widget should be
+            inserted.  In particular, the index of ``child`` will be
+            ``index``; and the index of any children whose indices were
+            greater than or equal to ``index`` before ``child`` was
+            inserted will be incremented by one.
+        """
+        self._children.insert(index, child)
+        self._add_child_widget(child)
+
+
+class StackWidget(CanvasWidget):
+    """
+    A canvas widget that keeps a list of canvas widgets in a vertical
+    line.
+
+    Attributes:
+      - ``align``: The horizontal alignment of the children.  Possible
+        values are ``'left'``, ``'center'``, and ``'right'``.  By
+        default, children are center-aligned.
+      - ``space``: The amount of vertical space to place between
+        children.  By default, one pixel of space is used.
+      - ``ordered``: If true, then keep the children in their
+        original order.
+    """
+
+    def __init__(self, canvas, *children, **attribs):
+        """
+        Create a new stack widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :param children: The widgets that should be aligned
+            vertically.  Each child must not have a parent.
+        :type children: list(CanvasWidget)
+        :param attribs: The new canvas widget's attributes.
+        """
+        self._align = "center"
+        self._space = 1
+        self._ordered = False
+        self._children = list(children)
+        for child in children:
+            self._add_child_widget(child)
+        CanvasWidget.__init__(self, canvas, **attribs)
+
+    def __setitem__(self, attr, value):
+        if attr == "align":
+            if value not in ("left", "right", "center"):
+                raise ValueError("Bad alignment: %r" % value)
+            self._align = value
+        elif attr == "space":
+            self._space = value
+        elif attr == "ordered":
+            self._ordered = value
+        else:
+            CanvasWidget.__setitem__(self, attr, value)
+
+    def __getitem__(self, attr):
+        if attr == "align":
+            return self._align
+        elif attr == "space":
+            return self._space
+        elif attr == "ordered":
+            return self._ordered
+        else:
+            return CanvasWidget.__getitem__(self, attr)
+
+    def _tags(self):
+        return []
+
+    def _xalign(self, left, right):
+        if self._align == "left":
+            return left
+        if self._align == "right":
+            return right
+        if self._align == "center":
+            return (left + right) / 2
+
+    def _update(self, child):
+        # Align all children with child.
+        (left, top, right, bot) = child.bbox()
+        x = self._xalign(left, right)
+        for c in self._children:
+            (x1, y1, x2, y2) = c.bbox()
+            c.move(x - self._xalign(x1, x2), 0)
+
+        if self._ordered and len(self._children) > 1:
+            index = self._children.index(child)
+
+            y = bot + self._space
+            for i in range(index + 1, len(self._children)):
+                (x1, y1, x2, y2) = self._children[i].bbox()
+                if y > y1:
+                    self._children[i].move(0, y - y1)
+                    y += y2 - y1 + self._space
+
+            y = top - self._space
+            for i in range(index - 1, -1, -1):
+                (x1, y1, x2, y2) = self._children[i].bbox()
+                if y < y2:
+                    self._children[i].move(0, y - y2)
+                    y -= y2 - y1 + self._space
+
+    def _manage(self):
+        if len(self._children) == 0:
+            return
+        child = self._children[0]
+
+        # Align all children with child.
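+        # (The first child anchors the layout: the children after it
+        # are stacked below it, and the children before it are stacked
+        # above it, in order.)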
+        (left, top, right, bot) = child.bbox()
+        x = self._xalign(left, right)
+
+        index = self._children.index(child)
+
+        # Line up children below the child.
+        y = bot + self._space
+        for i in range(index + 1, len(self._children)):
+            (x1, y1, x2, y2) = self._children[i].bbox()
+            self._children[i].move(x - self._xalign(x1, x2), y - y1)
+            y += y2 - y1 + self._space
+
+        # Line up children above the child.
+        y = top - self._space
+        for i in range(index - 1, -1, -1):
+            (x1, y1, x2, y2) = self._children[i].bbox()
+            self._children[i].move(x - self._xalign(x1, x2), y - y2)
+            y -= y2 - y1 + self._space
+
+    def __repr__(self):
+        return "[Stack: " + repr(self._children)[1:-1] + "]"
+
+    # Provide an alias for the child_widgets() member.
+    children = CanvasWidget.child_widgets
+
+    def replace_child(self, oldchild, newchild):
+        """
+        Replace the child canvas widget ``oldchild`` with ``newchild``.
+        ``newchild`` must not have a parent.  ``oldchild``'s parent will
+        be set to None.
+
+        :type oldchild: CanvasWidget
+        :param oldchild: The child canvas widget to remove.
+        :type newchild: CanvasWidget
+        :param newchild: The canvas widget that should replace
+            ``oldchild``.
+        """
+        index = self._children.index(oldchild)
+        self._children[index] = newchild
+        self._remove_child_widget(oldchild)
+        self._add_child_widget(newchild)
+        self.update(newchild)
+
+    def remove_child(self, child):
+        """
+        Remove the given child canvas widget.  ``child``'s parent will
+        be set to None.
+
+        :type child: CanvasWidget
+        :param child: The child canvas widget to remove.
+        """
+        index = self._children.index(child)
+        del self._children[index]
+        self._remove_child_widget(child)
+        if len(self._children) > 0:
+            self.update(self._children[0])
+
+    def insert_child(self, index, child):
+        """
+        Insert a child canvas widget before a given index.
+
+        :type child: CanvasWidget
+        :param child: The canvas widget that should be inserted.
+        :type index: int
+        :param index: The index where the child widget should be
+            inserted.  In particular, the index of ``child`` will be
+            ``index``; and the index of any children whose indices were
+            greater than or equal to ``index`` before ``child`` was
+            inserted will be incremented by one.
+        """
+        self._children.insert(index, child)
+        self._add_child_widget(child)
+
+
+class SpaceWidget(CanvasWidget):
+    """
+    A canvas widget that takes up space but does not display
+    anything.  A ``SpaceWidget`` can be used to add space between
+    elements.  Each space widget is characterized by a width and a
+    height.  If you wish to only create horizontal space, then use a
+    height of zero; and if you wish to only create vertical space, use
+    a width of zero.
+    """
+
+    def __init__(self, canvas, width, height, **attribs):
+        """
+        Create a new space widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :type width: int
+        :param width: The width of the new space widget.
+        :type height: int
+        :param height: The height of the new space widget.
+        :param attribs: The new canvas widget's attributes.
+        """
+        # For some reason, Tk appears to pad line items by a few
+        # pixels; shrink the requested size slightly to compensate.
+        if width > 4:
+            width -= 4
+        if height > 4:
+            height -= 4
+        self._tag = canvas.create_line(1, 1, width, height, fill="")
+        CanvasWidget.__init__(self, canvas, **attribs)
+
+    # note: width() and height() are already defined by CanvasWidget.
+    def set_width(self, width):
+        """
+        Change the width of this space widget.
+
+        :param width: The new width.
+        :type width: int
+        :rtype: None
+        """
+        [x1, y1, x2, y2] = self.bbox()
+        self.canvas().coords(self._tag, x1, y1, x1 + width, y2)
+
+    def set_height(self, height):
+        """
+        Change the height of this space widget.
+
+        :param height: The new height.
+        :type height: int
+        :rtype: None
+        """
+        [x1, y1, x2, y2] = self.bbox()
+        self.canvas().coords(self._tag, x1, y1, x2, y1 + height)
+
+    def _tags(self):
+        return [self._tag]
+
+    def __repr__(self):
+        return "[Space]"
+
+
+class ScrollWatcherWidget(CanvasWidget):
+    """
+    A special canvas widget that adjusts its ``Canvas``'s scrollregion
+    to always include the bounding boxes of all of its children.  The
+    scroll-watcher widget will only increase the size of the
+    ``Canvas``'s scrollregion; it will never decrease it.
+    """
+
+    def __init__(self, canvas, *children, **attribs):
+        """
+        Create a new scroll-watcher widget.
+
+        :type canvas: Tkinter.Canvas
+        :param canvas: This canvas widget's canvas.
+        :type children: list(CanvasWidget)
+        :param children: The canvas widgets watched by the
+            scroll-watcher.  The scroll-watcher will ensure that these
+            canvas widgets are always contained in their canvas's
+            scrollregion.
+        :param attribs: The new canvas widget's attributes.
+        """
+        for child in children:
+            self._add_child_widget(child)
+        CanvasWidget.__init__(self, canvas, **attribs)
+
+    def add_child(self, canvaswidget):
+        """
+        Add a new canvas widget to the scroll-watcher.  The
+        scroll-watcher will ensure that the new canvas widget is
+        always contained in its canvas's scrollregion.
+
+        :param canvaswidget: The new canvas widget.
+        :type canvaswidget: CanvasWidget
+        :rtype: None
+        """
+        self._add_child_widget(canvaswidget)
+        self.update(canvaswidget)
+
+    def remove_child(self, canvaswidget):
+        """
+        Remove a canvas widget from the scroll-watcher.  The
+        scroll-watcher will no longer ensure that the given canvas
+        widget is always contained in its canvas's scrollregion.
+
+        :param canvaswidget: The canvas widget to remove.
+        :type canvaswidget: CanvasWidget
+        :rtype: None
+        """
+        self._remove_child_widget(canvaswidget)
+
+    def _tags(self):
+        return []
+
+    def _update(self, child):
+        self._adjust_scrollregion()
+
+    def _adjust_scrollregion(self):
+        """
+        Adjust the scrollregion of this scroll-watcher's ``Canvas`` to
+        include the bounding boxes of all of its children.
+        """
+        bbox = self.bbox()
+        canvas = self.canvas()
+        scrollregion = [int(n) for n in canvas["scrollregion"].split()]
+        if len(scrollregion) != 4:
+            return
+        if (
+            bbox[0] < scrollregion[0]
+            or bbox[1] < scrollregion[1]
+            or bbox[2] > scrollregion[2]
+            or bbox[3] > scrollregion[3]
+        ):
+            scrollregion = "%d %d %d %d" % (
+                min(bbox[0], scrollregion[0]),
+                min(bbox[1], scrollregion[1]),
+                max(bbox[2], scrollregion[2]),
+                max(bbox[3], scrollregion[3]),
+            )
+            canvas["scrollregion"] = scrollregion
+
+
+##//////////////////////////////////////////////////////
+##  Canvas Frame
+##//////////////////////////////////////////////////////
+
+
+class CanvasFrame:
+    """
+    A ``Tkinter`` frame containing a canvas and scrollbars.
+    ``CanvasFrame`` uses a ``ScrollWatcherWidget`` to ensure that all of
+    the canvas widgets contained on its canvas are within its
+    scrollregion.  In order for ``CanvasFrame`` to make these checks,
+    all canvas widgets must be registered with ``add_widget`` when they
+    are added to the canvas; and destroyed with ``destroy_widget``
+    when they are no longer needed.
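+
+    For example (a minimal sketch; requires an interactive Tk display,
+    and the output filename is purely illustrative):
+
+        >>> cf = CanvasFrame(width=400, height=300)  # doctest: +SKIP
+        >>> w = TextWidget(cf.canvas(), 'sample text')  # doctest: +SKIP
+        >>> cf.add_widget(w, 10, 10)  # doctest: +SKIP
+        >>> cf.print_to_file('sample.ps')  # doctest: +SKIP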
+
+    If a ``CanvasFrame`` is created with no parent, then it will create
+    its own main window, including a "Done" button and a "Print"
+    button.
+    """
+
+    def __init__(self, parent=None, **kw):
+        """
+        Create a new ``CanvasFrame``.
+
+        :type parent: Tkinter.BaseWidget or Tkinter.Tk
+        :param parent: The parent ``Tkinter`` widget.  If no parent is
+            specified, then ``CanvasFrame`` will create a new main
+            window.
+        :param kw: Keyword arguments for the new ``Canvas``.  See the
+            documentation for ``Tkinter.Canvas`` for more information.
+        """
+        # If no parent was given, set up a top-level window.
+        if parent is None:
+            self._parent = Tk()
+            self._parent.title("NLTK")
+            self._parent.bind("<Control-p>", lambda e: self.print_to_file())
+            self._parent.bind("<Control-x>", self.destroy)
+            self._parent.bind("<Control-q>", self.destroy)
+        else:
+            self._parent = parent
+
+        # Create a frame for the canvas & scrollbars
+        self._frame = frame = Frame(self._parent)
+        self._canvas = canvas = Canvas(frame, **kw)
+        xscrollbar = Scrollbar(self._frame, orient="horizontal")
+        yscrollbar = Scrollbar(self._frame, orient="vertical")
+        xscrollbar["command"] = canvas.xview
+        yscrollbar["command"] = canvas.yview
+        canvas["xscrollcommand"] = xscrollbar.set
+        canvas["yscrollcommand"] = yscrollbar.set
+        yscrollbar.pack(fill="y", side="right")
+        xscrollbar.pack(fill="x", side="bottom")
+        canvas.pack(expand=1, fill="both", side="left")
+
+        # Set initial scroll region.
+        scrollregion = "0 0 {} {}".format(canvas["width"], canvas["height"])
+        canvas["scrollregion"] = scrollregion
+
+        self._scrollwatcher = ScrollWatcherWidget(canvas)
+
+        # If no parent was given, pack the frame, and add a menu.
+        if parent is None:
+            self.pack(expand=1, fill="both")
+            self._init_menubar()
+
+    def _init_menubar(self):
+        menubar = Menu(self._parent)
+
+        filemenu = Menu(menubar, tearoff=0)
+        filemenu.add_command(
+            label="Print to Postscript",
+            underline=0,
+            command=self.print_to_file,
+            accelerator="Ctrl-p",
+        )
+        filemenu.add_command(
+            label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x"
+        )
+        menubar.add_cascade(label="File", underline=0, menu=filemenu)
+
+        self._parent.config(menu=menubar)
+
+    def print_to_file(self, filename=None):
+        """
+        Print the contents of this ``CanvasFrame`` to a postscript
+        file.  If no filename is given, then prompt the user for one.
+
+        :param filename: The name of the file to print the canvas
+            contents to.
+        :type filename: str
+        :rtype: None
+        """
+        if filename is None:
+            ftypes = [("Postscript files", ".ps"), ("All files", "*")]
+            filename = asksaveasfilename(filetypes=ftypes, defaultextension=".ps")
+            if not filename:
+                return
+        (x0, y0, w, h) = self.scrollregion()
+        postscript = self._canvas.postscript(
+            x=x0,
+            y=y0,
+            width=w + 2,
+            height=h + 2,
+            pagewidth=w + 2,  # points = 1/72 inch
+            pageheight=h + 2,  # points = 1/72 inch
+            pagex=0,
+            pagey=0,
+        )
+        # workaround for bug in Tk font handling
+        postscript = postscript.replace(" 0 scalefont ", " 9 scalefont ")
+        with open(filename, "wb") as f:
+            f.write(postscript.encode("utf8"))
+
+    def scrollregion(self):
+        """
+        :return: The current scroll region for the canvas managed by
+            this ``CanvasFrame``.
+        :rtype: 4-tuple of int
+        """
+        (x1, y1, x2, y2) = self._canvas["scrollregion"].split()
+        return (int(x1), int(y1), int(x2), int(y2))
+
+    def canvas(self):
+        """
+        :return: The canvas managed by this ``CanvasFrame``.
+        :rtype: Tkinter.Canvas
+        """
+        return self._canvas
+
+    def add_widget(self, canvaswidget, x=None, y=None):
+        """
+        Register a canvas widget with this ``CanvasFrame``.
+        The ``CanvasFrame`` will ensure that this canvas widget is
+        always within the ``Canvas``'s scrollregion.  If no coordinates
+        are given for the canvas widget, then the ``CanvasFrame`` will
+        attempt to find a clear area of the canvas for it.
+
+        :type canvaswidget: CanvasWidget
+        :param canvaswidget: The new canvas widget.  ``canvaswidget``
+            must have been created on this ``CanvasFrame``'s canvas.
+        :type x: int
+        :param x: The initial x coordinate for the upper left hand
+            corner of ``canvaswidget``, in the canvas's coordinate
+            space.
+        :type y: int
+        :param y: The initial y coordinate for the upper left hand
+            corner of ``canvaswidget``, in the canvas's coordinate
+            space.
+        """
+        if x is None or y is None:
+            (x, y) = self._find_room(canvaswidget, x, y)
+
+        # Move to (x,y)
+        (x1, y1, x2, y2) = canvaswidget.bbox()
+        canvaswidget.move(x - x1, y - y1)
+
+        # Register with scrollwatcher.
+        self._scrollwatcher.add_child(canvaswidget)
+
+    def _find_room(self, widget, desired_x, desired_y):
+        """
+        Try to find a space for a given widget.
+        """
+        (left, top, right, bot) = self.scrollregion()
+        w = widget.width()
+        h = widget.height()
+
+        if w >= (right - left):
+            return (0, 0)
+        if h >= (bot - top):
+            return (0, 0)
+
+        # Move the widget out of the way, for now.
+        (x1, y1, x2, y2) = widget.bbox()
+        widget.move(left - x2 - 50, top - y2 - 50)
+
+        if desired_x is not None:
+            x = desired_x
+            for y in range(top, bot - h, int((bot - top - h) / 10)):
+                if not self._canvas.find_overlapping(
+                    x - 5, y - 5, x + w + 5, y + h + 5
+                ):
+                    return (x, y)
+
+        if desired_y is not None:
+            y = desired_y
+            for x in range(left, right - w, int((right - left - w) / 10)):
+                if not self._canvas.find_overlapping(
+                    x - 5, y - 5, x + w + 5, y + h + 5
+                ):
+                    return (x, y)
+
+        for y in range(top, bot - h, int((bot - top - h) / 10)):
+            for x in range(left, right - w, int((right - left - w) / 10)):
+                if not self._canvas.find_overlapping(
+                    x - 5, y - 5, x + w + 5, y + h + 5
+                ):
+                    return (x, y)
+        return (0, 0)
+
+    def destroy_widget(self, canvaswidget):
+        """
+        Remove a canvas widget from this ``CanvasFrame``.  This
+        deregisters the canvas widget, and destroys it.
+        """
+        self.remove_widget(canvaswidget)
+        canvaswidget.destroy()
+
+    def remove_widget(self, canvaswidget):
+        # Deregister with scrollwatcher.
+        self._scrollwatcher.remove_child(canvaswidget)
+
+    def pack(self, cnf={}, **kw):
+        """
+        Pack this ``CanvasFrame``.  See the documentation for
+        ``Tkinter.Pack`` for more information.
+        """
+        self._frame.pack(cnf, **kw)
+        # Adjust to be big enough for kids?
+
+    def destroy(self, *e):
+        """
+        Destroy this ``CanvasFrame``.  If this ``CanvasFrame`` created a
+        top-level window, then this will close that window.
+        """
+        if self._parent is None:
+            return
+        self._parent.destroy()
+        self._parent = None
+
+    def mainloop(self, *args, **kwargs):
+        """
+        Enter the Tkinter mainloop.  This function must be called if
+        this frame is created from a non-interactive program (e.g.
+        from a script); otherwise, the frame will close as soon as
+        the script completes.
+        """
+        if in_idle():
+            return
+        self._parent.mainloop(*args, **kwargs)
+
+
+##//////////////////////////////////////////////////////
+## Text display
+##//////////////////////////////////////////////////////
+
+
+class ShowText:
+    """
+    A ``Tkinter`` window used to display a text.  ``ShowText`` is
+    typically used by graphical tools to display help text, or similar
+    information.
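+
+    A hedged usage sketch (it mirrors the commented-out example in this
+    module's ``demo()``; passing ``None`` as the root makes ``ShowText``
+    create its own top-level window)::
+
+        ShowText(None, 'title', 'This is the text to display.')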
+ """ + + def __init__(self, root, title, text, width=None, height=None, **textbox_options): + if width is None or height is None: + (width, height) = self.find_dimentions(text, width, height) + + # Create the main window. + if root is None: + self._top = top = Tk() + else: + self._top = top = Toplevel(root) + top.title(title) + + b = Button(top, text="Ok", command=self.destroy) + b.pack(side="bottom") + + tbf = Frame(top) + tbf.pack(expand=1, fill="both") + scrollbar = Scrollbar(tbf, orient="vertical") + scrollbar.pack(side="right", fill="y") + textbox = Text(tbf, wrap="word", width=width, height=height, **textbox_options) + textbox.insert("end", text) + textbox["state"] = "disabled" + textbox.pack(side="left", expand=1, fill="both") + scrollbar["command"] = textbox.yview + textbox["yscrollcommand"] = scrollbar.set + + # Make it easy to close the window. + top.bind("q", self.destroy) + top.bind("x", self.destroy) + top.bind("c", self.destroy) + top.bind("", self.destroy) + top.bind("", self.destroy) + + # Focus the scrollbar, so they can use up/down, etc. + scrollbar.focus() + + def find_dimentions(self, text, width, height): + lines = text.split("\n") + if width is None: + maxwidth = max(len(line) for line in lines) + width = min(maxwidth, 80) + + # Now, find height. + height = 0 + for line in lines: + while len(line) > width: + brk = line[:width].rfind(" ") + line = line[brk:] + height += 1 + height += 1 + height = min(height, 25) + + return (width, height) + + def destroy(self, *e): + if self._top is None: + return + self._top.destroy() + self._top = None + + def mainloop(self, *args, **kwargs): + """ + Enter the Tkinter mainloop. This function must be called if + this window is created from a non-interactive program (e.g. + from a secript); otherwise, the window will close as soon as + the script completes. + """ + if in_idle(): + return + self._top.mainloop(*args, **kwargs) + + +##////////////////////////////////////////////////////// +## Entry dialog +##////////////////////////////////////////////////////// + + +class EntryDialog: + """ + A dialog box for entering + """ + + def __init__( + self, parent, original_text="", instructions="", set_callback=None, title=None + ): + self._parent = parent + self._original_text = original_text + self._set_callback = set_callback + + width = int(max(30, len(original_text) * 3 / 2)) + self._top = Toplevel(parent) + + if title: + self._top.title(title) + + # The text entry box. + entryframe = Frame(self._top) + entryframe.pack(expand=1, fill="both", padx=5, pady=5, ipady=10) + if instructions: + l = Label(entryframe, text=instructions) + l.pack(side="top", anchor="w", padx=30) + self._entry = Entry(entryframe, width=width) + self._entry.pack(expand=1, fill="x", padx=30) + self._entry.insert(0, original_text) + + # A divider + divider = Frame(self._top, borderwidth=1, relief="sunken") + divider.pack(fill="x", ipady=1, padx=10) + + # The buttons. 
+        buttons = Frame(self._top)
+        buttons.pack(expand=0, fill="x", padx=5, pady=5)
+        b = Button(buttons, text="Cancel", command=self._cancel, width=8)
+        b.pack(side="right", padx=5)
+        b = Button(buttons, text="Ok", command=self._ok, width=8, default="active")
+        b.pack(side="left", padx=5)
+        b = Button(buttons, text="Apply", command=self._apply, width=8)
+        b.pack(side="left")
+
+        self._top.bind("<Return>", self._ok)
+        self._top.bind("<Escape>", self._cancel)
+        self._top.bind("<Destroy>", self._cancel)
+
+        self._entry.focus()
+
+    def _reset(self, *e):
+        self._entry.delete(0, "end")
+        self._entry.insert(0, self._original_text)
+        if self._set_callback:
+            self._set_callback(self._original_text)
+
+    def _cancel(self, *e):
+        try:
+            self._reset()
+        except Exception:
+            # The entry widget may already have been destroyed.
+            pass
+        self._destroy()
+
+    def _ok(self, *e):
+        self._apply()
+        self._destroy()
+
+    def _apply(self, *e):
+        if self._set_callback:
+            self._set_callback(self._entry.get())
+
+    def _destroy(self, *e):
+        if self._top is None:
+            return
+        self._top.destroy()
+        self._top = None
+
+
+##//////////////////////////////////////////////////////
+## Colorized List
+##//////////////////////////////////////////////////////
+
+
+class ColorizedList:
+    """
+    An abstract base class for displaying a colorized list of items.
+    Subclasses should define:
+
+    - ``_init_colortags``, which sets up Text color tags that
+      will be used by the list.
+    - ``_item_repr``, which returns a list of (text,colortag)
+      tuples that make up the colorized representation of the
+      item.
+
+    :note: Typically, you will want to register a callback for
+        ``'select'`` that calls ``mark`` on the given item.
+    """
+
+    def __init__(self, parent, items=[], **options):
+        """
+        Construct a new list.
+
+        :param parent: The Tk widget that contains the colorized list
+        :param items: The initial contents of the colorized list.
+        :param options: Keyword options, passed through to the
+            underlying ``Text`` widget.
+        """
+        self._parent = parent
+        self._callbacks = {}
+
+        # Which items are marked?
+        self._marks = {}
+
+        # Initialize the Tkinter frames.
+        self._init_itemframe(options.copy())
+
+        # Set up key & mouse bindings.
+        self._textwidget.bind("<KeyPress>", self._keypress)
+        self._textwidget.bind("<ButtonPress>", self._buttonpress)
+
+        # Fill in the list's initial items.
+        self._items = None
+        self.set(items)
+
+    # ////////////////////////////////////////////////////////////
+    # Abstract methods
+    # ////////////////////////////////////////////////////////////
+    @abstractmethod
+    def _init_colortags(self, textwidget, options):
+        """
+        Set up any colortags that will be used by this colorized list.
+        E.g.:
+            textwidget.tag_config('terminal', foreground='black')
+        """
+
+    @abstractmethod
+    def _item_repr(self, item):
+        """
+        Return a list of (text, colortag) tuples that make up the
+        colorized representation of the item.  Colorized
+        representations may not span multiple lines.  I.e., the text
+        strings returned may not contain newline characters.
+        """
+
+    # ////////////////////////////////////////////////////////////
+    # Item Access
+    # ////////////////////////////////////////////////////////////
+
+    def get(self, index=None):
+        """
+        :return: A list of the items contained by this list.
+        """
+        if index is None:
+            return self._items[:]
+        else:
+            return self._items[index]
+
+    def set(self, items):
+        """
+        Modify the list of items contained by this list.
+ """ + items = list(items) + if self._items == items: + return + self._items = list(items) + + self._textwidget["state"] = "normal" + self._textwidget.delete("1.0", "end") + for item in items: + for (text, colortag) in self._item_repr(item): + assert "\n" not in text, "item repr may not contain newline" + self._textwidget.insert("end", text, colortag) + self._textwidget.insert("end", "\n") + # Remove the final newline + self._textwidget.delete("end-1char", "end") + self._textwidget.mark_set("insert", "1.0") + self._textwidget["state"] = "disabled" + # Clear all marks + self._marks.clear() + + def unmark(self, item=None): + """ + Remove highlighting from the given item; or from every item, + if no item is given. + :raise ValueError: If ``item`` is not contained in the list. + :raise KeyError: If ``item`` is not marked. + """ + if item is None: + self._marks.clear() + self._textwidget.tag_remove("highlight", "1.0", "end+1char") + else: + index = self._items.index(item) + del self._marks[item] + (start, end) = ("%d.0" % (index + 1), "%d.0" % (index + 2)) + self._textwidget.tag_remove("highlight", start, end) + + def mark(self, item): + """ + Highlight the given item. + :raise ValueError: If ``item`` is not contained in the list. + """ + self._marks[item] = 1 + index = self._items.index(item) + (start, end) = ("%d.0" % (index + 1), "%d.0" % (index + 2)) + self._textwidget.tag_add("highlight", start, end) + + def markonly(self, item): + """ + Remove any current highlighting, and mark the given item. + :raise ValueError: If ``item`` is not contained in the list. + """ + self.unmark() + self.mark(item) + + def view(self, item): + """ + Adjust the view such that the given item is visible. If + the item is already visible, then do nothing. + """ + index = self._items.index(item) + self._textwidget.see("%d.0" % (index + 1)) + + # //////////////////////////////////////////////////////////// + # Callbacks + # //////////////////////////////////////////////////////////// + + def add_callback(self, event, func): + """ + Register a callback function with the list. This function + will be called whenever the given event occurs. + + :param event: The event that will trigger the callback + function. Valid events are: click1, click2, click3, + space, return, select, up, down, next, prior, move + :param func: The function that should be called when + the event occurs. ``func`` will be called with a + single item as its argument. (The item selected + or the item moved to). + """ + if event == "select": + events = ["click1", "space", "return"] + elif event == "move": + events = ["up", "down", "next", "prior"] + else: + events = [event] + + for e in events: + self._callbacks.setdefault(e, {})[func] = 1 + + def remove_callback(self, event, func=None): + """ + Deregister a callback function. If ``func`` is none, then + all callbacks are removed for the given event. 
+ """ + if event is None: + events = list(self._callbacks.keys()) + elif event == "select": + events = ["click1", "space", "return"] + elif event == "move": + events = ["up", "down", "next", "prior"] + else: + events = [event] + + for e in events: + if func is None: + del self._callbacks[e] + else: + try: + del self._callbacks[e][func] + except: + pass + + # //////////////////////////////////////////////////////////// + # Tkinter Methods + # //////////////////////////////////////////////////////////// + + def pack(self, cnf={}, **kw): + # "@include: Tkinter.Pack.pack" + self._itemframe.pack(cnf, **kw) + + def grid(self, cnf={}, **kw): + # "@include: Tkinter.Grid.grid" + self._itemframe.grid(cnf, *kw) + + def focus(self): + # "@include: Tkinter.Widget.focus" + self._textwidget.focus() + + # //////////////////////////////////////////////////////////// + # Internal Methods + # //////////////////////////////////////////////////////////// + + def _init_itemframe(self, options): + self._itemframe = Frame(self._parent) + + # Create the basic Text widget & scrollbar. + options.setdefault("background", "#e0e0e0") + self._textwidget = Text(self._itemframe, **options) + self._textscroll = Scrollbar(self._itemframe, takefocus=0, orient="vertical") + self._textwidget.config(yscrollcommand=self._textscroll.set) + self._textscroll.config(command=self._textwidget.yview) + self._textscroll.pack(side="right", fill="y") + self._textwidget.pack(expand=1, fill="both", side="left") + + # Initialize the colorization tags + self._textwidget.tag_config( + "highlight", background="#e0ffff", border="1", relief="raised" + ) + self._init_colortags(self._textwidget, options) + + # How do I want to mark keyboard selection? + self._textwidget.tag_config("sel", foreground="") + self._textwidget.tag_config( + "sel", foreground="", background="", border="", underline=1 + ) + self._textwidget.tag_lower("highlight", "sel") + + def _fire_callback(self, event, itemnum): + if event not in self._callbacks: + return + if 0 <= itemnum < len(self._items): + item = self._items[itemnum] + else: + item = None + for cb_func in list(self._callbacks[event].keys()): + cb_func(item) + + def _buttonpress(self, event): + clickloc = "@%d,%d" % (event.x, event.y) + insert_point = self._textwidget.index(clickloc) + itemnum = int(insert_point.split(".")[0]) - 1 + self._fire_callback("click%d" % event.num, itemnum) + + def _keypress(self, event): + if event.keysym == "Return" or event.keysym == "space": + insert_point = self._textwidget.index("insert") + itemnum = int(insert_point.split(".")[0]) - 1 + self._fire_callback(event.keysym.lower(), itemnum) + return + elif event.keysym == "Down": + delta = "+1line" + elif event.keysym == "Up": + delta = "-1line" + elif event.keysym == "Next": + delta = "+10lines" + elif event.keysym == "Prior": + delta = "-10lines" + else: + return "continue" + + self._textwidget.mark_set("insert", "insert" + delta) + self._textwidget.see("insert") + self._textwidget.tag_remove("sel", "1.0", "end+1char") + self._textwidget.tag_add("sel", "insert linestart", "insert lineend") + + insert_point = self._textwidget.index("insert") + itemnum = int(insert_point.split(".")[0]) - 1 + self._fire_callback(event.keysym.lower(), itemnum) + + return "break" + + +##////////////////////////////////////////////////////// +## Improved OptionMenu +##////////////////////////////////////////////////////// + + +class MutableOptionMenu(Menubutton): + def __init__(self, master, values, **options): + self._callback = options.get("command") + if 
"command" in options: + del options["command"] + + # Create a variable + self._variable = variable = StringVar() + if len(values) > 0: + variable.set(values[0]) + + kw = { + "borderwidth": 2, + "textvariable": variable, + "indicatoron": 1, + "relief": RAISED, + "anchor": "c", + "highlightthickness": 2, + } + kw.update(options) + Widget.__init__(self, master, "menubutton", kw) + self.widgetName = "tk_optionMenu" + self._menu = Menu(self, name="menu", tearoff=0) + self.menuname = self._menu._w + + self._values = [] + for value in values: + self.add(value) + + self["menu"] = self._menu + + def add(self, value): + if value in self._values: + return + + def set(value=value): + self.set(value) + + self._menu.add_command(label=value, command=set) + self._values.append(value) + + def set(self, value): + self._variable.set(value) + if self._callback: + self._callback(value) + + def remove(self, value): + # Might raise indexerror: pass to parent. + i = self._values.index(value) + del self._values[i] + self._menu.delete(i, i) + + def __getitem__(self, name): + if name == "menu": + return self.__menu + return Widget.__getitem__(self, name) + + def destroy(self): + """Destroy this widget and the associated menu.""" + Menubutton.destroy(self) + self._menu = None + + +##////////////////////////////////////////////////////// +## Test code. +##////////////////////////////////////////////////////// + + +def demo(): + """ + A simple demonstration showing how to use canvas widgets. + """ + + def fill(cw): + from random import randint + + cw["fill"] = "#00%04d" % randint(0, 9999) + + def color(cw): + from random import randint + + cw["color"] = "#ff%04d" % randint(0, 9999) + + cf = CanvasFrame(closeenough=10, width=300, height=300) + c = cf.canvas() + ct3 = TextWidget(c, "hiya there", draggable=1) + ct2 = TextWidget(c, "o o\n||\n___\n U", draggable=1, justify="center") + co = OvalWidget(c, ct2, outline="red") + ct = TextWidget(c, "o o\n||\n\\___/", draggable=1, justify="center") + cp = ParenWidget(c, ct, color="red") + cb = BoxWidget(c, cp, fill="cyan", draggable=1, width=3, margin=10) + equation = SequenceWidget( + c, + SymbolWidget(c, "forall"), + TextWidget(c, "x"), + SymbolWidget(c, "exists"), + TextWidget(c, "y: "), + TextWidget(c, "x"), + SymbolWidget(c, "notequal"), + TextWidget(c, "y"), + ) + space = SpaceWidget(c, 0, 30) + cstack = StackWidget(c, cb, ct3, space, co, equation, align="center") + prompt_msg = TextWidget( + c, "try clicking\nand dragging", draggable=1, justify="center" + ) + cs = SequenceWidget(c, cstack, prompt_msg) + zz = BracketWidget(c, cs, color="green4", width=3) + cf.add_widget(zz, 60, 30) + + cb.bind_click(fill) + ct.bind_click(color) + co.bind_click(fill) + ct2.bind_click(color) + ct3.bind_click(color) + + cf.mainloop() + # ShowText(None, 'title', ((('this is text'*150)+'\n')*5)) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/featstruct.py b/.eggs/nltk-3.8-py3.10.egg/nltk/featstruct.py new file mode 100644 index 0000000000000000000000000000000000000000..fcbb54277fd9492f9f03dbe35e4ab3c06c8ef4b4 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/featstruct.py @@ -0,0 +1,2779 @@ +# Natural Language Toolkit: Feature Structures +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper , +# Rob Speer, +# Steven Bird +# URL: +# For license information, see LICENSE.TXT + +""" +Basic data classes for representing feature structures, and for +performing basic operations on those feature structures. 
+A feature structure is a mapping from feature identifiers to feature
+values, where each feature value is either a basic value (such as a
+string or an integer), or a nested feature structure.  There are two
+types of feature structure, implemented by two subclasses of
+``FeatStruct``:
+
+    - feature dictionaries, implemented by ``FeatDict``, act like
+      Python dictionaries.  Feature identifiers may be strings or
+      instances of the ``Feature`` class.
+    - feature lists, implemented by ``FeatList``, act like Python
+      lists.  Feature identifiers are integers.
+
+Feature structures are typically used to represent partial information
+about objects.  A feature identifier that is not mapped to a value
+stands for a feature whose value is unknown (*not* a feature without
+a value).  Two feature structures that represent (potentially
+overlapping) information about the same object can be combined by
+unification.  When two inconsistent feature structures are unified,
+the unification fails and returns None.
+
+Features can be specified using "feature paths", or tuples of feature
+identifiers that specify a path through the nested feature structures
+to a value.  Feature structures may contain reentrant feature values.
+A "reentrant feature value" is a single feature value that can be
+accessed via multiple feature paths.  Unification preserves the
+reentrance relations imposed by both of the unified feature
+structures.  In the feature structure resulting from unification, any
+modifications to a reentrant feature value will be visible using any
+of its feature paths.
+
+Feature structure variables are encoded using the ``nltk.sem.Variable``
+class.  The variables' values are tracked using a bindings
+dictionary, which maps variables to their values.  When two feature
+structures are unified, a fresh bindings dictionary is created to
+track their values; and before unification completes, all bound
+variables are replaced by their values.  Thus, the bindings
+dictionaries are usually strictly internal to the unification process.
+However, it is possible to track the bindings of variables if you
+choose to, by supplying your own initial bindings dictionary to the
+``unify()`` function.
+
+When unbound variables are unified with one another, they become
+aliased.  This is encoded by binding one variable to the other.
+
+Lightweight Feature Structures
+==============================
+Many of the functions defined by ``nltk.featstruct`` can be applied
+directly to simple Python dictionaries and lists, rather than to
+full-fledged ``FeatDict`` and ``FeatList`` objects.  In other words,
+Python ``dicts`` and ``lists`` can be used as "light-weight" feature
+structures.
+
+    >>> from nltk.featstruct import unify
+    >>> unify(dict(x=1, y=dict()), dict(a='a', y=dict(b='b')))  # doctest: +SKIP
+    {'y': {'b': 'b'}, 'x': 1, 'a': 'a'}
+
+However, you should keep in mind the following caveats:
+
+    - Python dictionaries & lists ignore reentrance when checking for
+      equality between values.  But two FeatStructs with different
+      reentrances are considered nonequal, even if all their base
+      values are equal.
+
+    - FeatStructs can be easily frozen, allowing them to be used as
+      keys in hash tables.  Python dictionaries and lists cannot.
+
+    - FeatStructs display reentrance in their string representations;
+      Python dictionaries and lists do not.
+
+    - FeatStructs may *not* be mixed with Python dictionaries and lists
+      (e.g., when performing unification).
+ + - FeatStructs provide a number of useful methods, such as ``walk()`` + and ``cyclic()``, which are not available for Python dicts and lists. + +In general, if your feature structures will contain any reentrances, +or if you plan to use them as dictionary keys, it is strongly +recommended that you use full-fledged ``FeatStruct`` objects. +""" + +import copy +import re +from functools import total_ordering + +from nltk.internals import raise_unorderable_types, read_str +from nltk.sem.logic import ( + Expression, + LogicalExpressionException, + LogicParser, + SubstituteBindingsI, + Variable, +) + +###################################################################### +# Feature Structure +###################################################################### + + +@total_ordering +class FeatStruct(SubstituteBindingsI): + """ + A mapping from feature identifiers to feature values, where each + feature value is either a basic value (such as a string or an + integer), or a nested feature structure. There are two types of + feature structure: + + - feature dictionaries, implemented by ``FeatDict``, act like + Python dictionaries. Feature identifiers may be strings or + instances of the ``Feature`` class. + - feature lists, implemented by ``FeatList``, act like Python + lists. Feature identifiers are integers. + + Feature structures may be indexed using either simple feature + identifiers or 'feature paths.' A feature path is a sequence + of feature identifiers that stand for a corresponding sequence of + indexing operations. In particular, ``fstruct[(f1,f2,...,fn)]`` is + equivalent to ``fstruct[f1][f2]...[fn]``. + + Feature structures may contain reentrant feature structures. A + "reentrant feature structure" is a single feature structure + object that can be accessed via multiple feature paths. Feature + structures may also be cyclic. A feature structure is "cyclic" + if there is any feature path from the feature structure to itself. + + Two feature structures are considered equal if they assign the + same values to all features, and have the same reentrancies. + + By default, feature structures are mutable. They may be made + immutable with the ``freeze()`` method. Once they have been + frozen, they may be hashed, and thus used as dictionary keys. + """ + + _frozen = False + """:ivar: A flag indicating whether this feature structure is + frozen or not. Once this flag is set, it should never be + un-set; and no further modification should be made to this + feature structure.""" + + ##//////////////////////////////////////////////////////////// + # { Constructor + ##//////////////////////////////////////////////////////////// + + def __new__(cls, features=None, **morefeatures): + """ + Construct and return a new feature structure. If this + constructor is called directly, then the returned feature + structure will be an instance of either the ``FeatDict`` class + or the ``FeatList`` class. + + :param features: The initial feature values for this feature + structure: + + - FeatStruct(string) -> FeatStructReader().read(string) + - FeatStruct(mapping) -> FeatDict(mapping) + - FeatStruct(sequence) -> FeatList(sequence) + - FeatStruct() -> FeatDict() + :param morefeatures: If ``features`` is a mapping or None, + then ``morefeatures`` provides additional features for the + ``FeatDict`` constructor. + """ + # If the FeatStruct constructor is called directly, then decide + # whether to create a FeatDict or a FeatList, based on the + # contents of the `features` argument. 
+        if cls is FeatStruct:
+            if features is None:
+                return FeatDict.__new__(FeatDict, **morefeatures)
+            elif _is_mapping(features):
+                return FeatDict.__new__(FeatDict, features, **morefeatures)
+            elif morefeatures:
+                raise TypeError(
+                    "Keyword arguments may only be specified "
+                    "if features is None or is a mapping."
+                )
+            if isinstance(features, str):
+                if FeatStructReader._START_FDICT_RE.match(features):
+                    return FeatDict.__new__(FeatDict, features, **morefeatures)
+                else:
+                    return FeatList.__new__(FeatList, features, **morefeatures)
+            elif _is_sequence(features):
+                return FeatList.__new__(FeatList, features)
+            else:
+                raise TypeError("Expected string or mapping or sequence")
+
+        # Otherwise, construct the object as normal.
+        else:
+            return super().__new__(cls, features, **morefeatures)
+
+    ##////////////////////////////////////////////////////////////
+    # { Uniform Accessor Methods
+    ##////////////////////////////////////////////////////////////
+    # These helper functions allow the methods defined by FeatStruct
+    # to treat all feature structures as mappings, even if they're
+    # really lists.  (Lists are treated as mappings from ints to vals)
+
+    def _keys(self):
+        """Return an iterable of the feature identifiers used by this
+        FeatStruct."""
+        raise NotImplementedError()  # Implemented by subclasses.
+
+    def _values(self):
+        """Return an iterable of the feature values directly defined
+        by this FeatStruct."""
+        raise NotImplementedError()  # Implemented by subclasses.
+
+    def _items(self):
+        """Return an iterable of (fid,fval) pairs, where fid is a
+        feature identifier and fval is the corresponding feature
+        value, for all features defined by this FeatStruct."""
+        raise NotImplementedError()  # Implemented by subclasses.
+
+    ##////////////////////////////////////////////////////////////
+    # { Equality & Hashing
+    ##////////////////////////////////////////////////////////////
+
+    def equal_values(self, other, check_reentrance=False):
+        """
+        Return True if ``self`` and ``other`` assign the same value to
+        every feature.  In particular, return true if
+        ``self[p]==other[p]`` for every feature path *p* such
+        that ``self[p]`` or ``other[p]`` is a base value (i.e.,
+        not a nested feature structure).
+
+        :param check_reentrance: If True, then also return False if
+            there is any difference between the reentrances of ``self``
+            and ``other``.
+        :note: the ``==`` operator is equivalent to ``equal_values()``
+            with ``check_reentrance=True``.
+        """
+        return self._equal(other, check_reentrance, set(), set(), set())
+
+    def __eq__(self, other):
+        """
+        Return true if ``self`` and ``other`` are both feature structures,
+        assign the same values to all features, and contain the same
+        reentrances.  I.e., return
+        ``self.equal_values(other, check_reentrance=True)``.
+
+        :see: ``equal_values()``
+        """
+        return self._equal(other, True, set(), set(), set())
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __lt__(self, other):
+        if not isinstance(other, FeatStruct):
+            # raise_unorderable_types("<", self, other)
+            # Sometimes feature values can be pure strings,
+            # so we need to be able to compare with non-featstructs:
+            return self.__class__.__name__ < other.__class__.__name__
+        else:
+            return len(self) < len(other)
+
+    def __hash__(self):
+        """
+        If this feature structure is frozen, return its hash value;
+        otherwise, raise ``TypeError``.
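+
+        A small doctest-style sketch of the freeze-then-hash contract::
+
+            >>> fs = FeatStruct(number='singular')
+            >>> fs.freeze()
+            >>> isinstance(hash(fs), int)
+            True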
+ """ + if not self._frozen: + raise TypeError("FeatStructs must be frozen before they " "can be hashed.") + try: + return self._hash + except AttributeError: + self._hash = self._calculate_hashvalue(set()) + return self._hash + + def _equal( + self, other, check_reentrance, visited_self, visited_other, visited_pairs + ): + """ + Return True iff self and other have equal values. + + :param visited_self: A set containing the ids of all ``self`` + feature structures we've already visited. + :param visited_other: A set containing the ids of all ``other`` + feature structures we've already visited. + :param visited_pairs: A set containing ``(selfid, otherid)`` pairs + for all pairs of feature structures we've already visited. + """ + # If we're the same object, then we're equal. + if self is other: + return True + + # If we have different classes, we're definitely not equal. + if self.__class__ != other.__class__: + return False + + # If we define different features, we're definitely not equal. + # (Perform len test first because it's faster -- we should + # do profiling to see if this actually helps) + if len(self) != len(other): + return False + if set(self._keys()) != set(other._keys()): + return False + + # If we're checking reentrance, then any time we revisit a + # structure, make sure that it was paired with the same + # feature structure that it is now. Note: if check_reentrance, + # then visited_pairs will never contain two pairs whose first + # values are equal, or two pairs whose second values are equal. + if check_reentrance: + if id(self) in visited_self or id(other) in visited_other: + return (id(self), id(other)) in visited_pairs + + # If we're not checking reentrance, then we still need to deal + # with cycles. If we encounter the same (self, other) pair a + # second time, then we won't learn anything more by examining + # their children a second time, so just return true. + else: + if (id(self), id(other)) in visited_pairs: + return True + + # Keep track of which nodes we've visited. + visited_self.add(id(self)) + visited_other.add(id(other)) + visited_pairs.add((id(self), id(other))) + + # Now we have to check all values. If any of them don't match, + # then return false. + for (fname, self_fval) in self._items(): + other_fval = other[fname] + if isinstance(self_fval, FeatStruct): + if not self_fval._equal( + other_fval, + check_reentrance, + visited_self, + visited_other, + visited_pairs, + ): + return False + else: + if self_fval != other_fval: + return False + + # Everything matched up; return true. + return True + + def _calculate_hashvalue(self, visited): + """ + Return a hash value for this feature structure. + + :require: ``self`` must be frozen. + :param visited: A set containing the ids of all feature + structures we've already visited while hashing. + """ + if id(self) in visited: + return 1 + visited.add(id(self)) + + hashval = 5831 + for (fname, fval) in sorted(self._items()): + hashval *= 37 + hashval += hash(fname) + hashval *= 37 + if isinstance(fval, FeatStruct): + hashval += fval._calculate_hashvalue(visited) + else: + hashval += hash(fval) + # Convert to a 32 bit int. + hashval = int(hashval & 0x7FFFFFFF) + return hashval + + ##//////////////////////////////////////////////////////////// + # { Freezing + ##//////////////////////////////////////////////////////////// + + #: Error message used by mutating methods when called on a frozen + #: feature structure. + _FROZEN_ERROR = "Frozen FeatStructs may not be modified." 
+
+    def freeze(self):
+        """
+        Make this feature structure, and any feature structures it
+        contains, immutable.  Note: this method does not attempt to
+        'freeze' any feature value that is not a ``FeatStruct``; it
+        is recommended that you use only immutable feature values.
+        """
+        if self._frozen:
+            return
+        self._freeze(set())
+
+    def frozen(self):
+        """
+        Return True if this feature structure is immutable.  Feature
+        structures can be made immutable with the ``freeze()`` method.
+        Immutable feature structures may not be made mutable again,
+        but new mutable copies can be produced with the ``copy()`` method.
+        """
+        return self._frozen
+
+    def _freeze(self, visited):
+        """
+        Make this feature structure, and any feature structure it
+        contains, immutable.
+
+        :param visited: A set containing the ids of all feature
+            structures we've already visited while freezing.
+        """
+        if id(self) in visited:
+            return
+        visited.add(id(self))
+        self._frozen = True
+        for (fname, fval) in sorted(self._items()):
+            if isinstance(fval, FeatStruct):
+                fval._freeze(visited)
+
+    ##////////////////////////////////////////////////////////////
+    # { Copying
+    ##////////////////////////////////////////////////////////////
+
+    def copy(self, deep=True):
+        """
+        Return a new copy of ``self``.  The new copy will not be frozen.
+
+        :param deep: If true, create a deep copy; if false, create
+            a shallow copy.
+        """
+        if deep:
+            return copy.deepcopy(self)
+        else:
+            return self.__class__(self)
+
+    # Subclasses should define __deepcopy__ to ensure that the new
+    # copy will not be frozen.
+    def __deepcopy__(self, memo):
+        raise NotImplementedError()  # Implemented by subclasses.
+
+    ##////////////////////////////////////////////////////////////
+    # { Structural Information
+    ##////////////////////////////////////////////////////////////
+
+    def cyclic(self):
+        """
+        Return True if this feature structure contains itself.
+        """
+        return self._find_reentrances({})[id(self)]
+
+    def walk(self):
+        """
+        Return an iterator that generates this feature structure, and
+        each feature structure it contains.  Each feature structure will
+        be generated exactly once.
+        """
+        return self._walk(set())
+
+    def _walk(self, visited):
+        """
+        Return an iterator that generates this feature structure, and
+        each feature structure it contains.
+
+        :param visited: A set containing the ids of all feature
+            structures we've already visited while walking.
+        """
+        if id(self) in visited:
+            return
+        visited.add(id(self))
+        yield self
+        for fval in self._values():
+            if isinstance(fval, FeatStruct):
+                yield from fval._walk(visited)
+
+    # Walk through the feature tree.  The first time we see a feature
+    # value, map it to False (not reentrant).  If we see a feature
+    # value more than once, then map it to True (reentrant).
+    def _find_reentrances(self, reentrances):
+        """
+        Return a dictionary that maps from the ``id`` of each feature
+        structure contained in ``self`` (including ``self``) to a
+        boolean value, indicating whether it is reentrant or not.
+        """
+        if id(self) in reentrances:
+            # We've seen it more than once.
+            reentrances[id(self)] = True
+        else:
+            # This is the first time we've seen it.
+            reentrances[id(self)] = False
+
+            # Recurse to contained feature structures.
+            for fval in self._values():
+                if isinstance(fval, FeatStruct):
+                    fval._find_reentrances(reentrances)
+
+        return reentrances
+
+    ##////////////////////////////////////////////////////////////
+    # { Variables & Bindings
+    ##////////////////////////////////////////////////////////////
+
+    def substitute_bindings(self, bindings):
+        """:see: ``nltk.featstruct.substitute_bindings()``"""
+        return substitute_bindings(self, bindings)
+
+    def retract_bindings(self, bindings):
+        """:see: ``nltk.featstruct.retract_bindings()``"""
+        return retract_bindings(self, bindings)
+
+    def variables(self):
+        """:see: ``nltk.featstruct.find_variables()``"""
+        return find_variables(self)
+
+    def rename_variables(self, vars=None, used_vars=(), new_vars=None):
+        """:see: ``nltk.featstruct.rename_variables()``"""
+        return rename_variables(self, vars, used_vars, new_vars)
+
+    def remove_variables(self):
+        """
+        Return the feature structure that is obtained by deleting
+        any feature whose value is a ``Variable``.
+
+        :rtype: FeatStruct
+        """
+        return remove_variables(self)
+
+    ##////////////////////////////////////////////////////////////
+    # { Unification
+    ##////////////////////////////////////////////////////////////
+
+    def unify(self, other, bindings=None, trace=False, fail=None, rename_vars=True):
+        return unify(self, other, bindings, trace, fail, rename_vars)
+
+    def subsumes(self, other):
+        """
+        Return True if ``self`` subsumes ``other``.  I.e., return true
+        if unifying ``self`` with ``other`` would result in a feature
+        structure equal to ``other``.
+        """
+        return subsumes(self, other)
+
+    ##////////////////////////////////////////////////////////////
+    # { String Representations
+    ##////////////////////////////////////////////////////////////
+
+    def __repr__(self):
+        """
+        Display a single-line representation of this feature structure,
+        suitable for embedding in other representations.
+        """
+        return self._repr(self._find_reentrances({}), {})
+
+    def _repr(self, reentrances, reentrance_ids):
+        """
+        Return a string representation of this feature structure.
+
+        :param reentrances: A dictionary that maps from the ``id`` of
+            each feature value in self to a boolean value, indicating
+            whether that value is reentrant or not.
+        :param reentrance_ids: A dictionary mapping from each ``id``
+            of a feature value to a unique identifier.  This is modified
+            by ``repr``: the first time a reentrant feature value is
+            displayed, an identifier is added to ``reentrance_ids`` for it.
+        """
+        raise NotImplementedError()
+
+
+# Mutation: disable if frozen.
+_FROZEN_ERROR = "Frozen FeatStructs may not be modified."
+_FROZEN_NOTICE = "\n%sIf self is frozen, raise ValueError."
+
+
+def _check_frozen(method, indent=""):
+    """
+    Given a method function, return a new method function that first
+    checks if ``self._frozen`` is true; and if so, raises ``ValueError``
+    with an appropriate message.  Otherwise, call the method and return
+    its result.
+    """
+
+    def wrapped(self, *args, **kwargs):
+        if self._frozen:
+            raise ValueError(_FROZEN_ERROR)
+        else:
+            return method(self, *args, **kwargs)
+
+    wrapped.__name__ = method.__name__
+    wrapped.__doc__ = (method.__doc__ or "") + (_FROZEN_NOTICE % indent)
+    return wrapped
+
+
+######################################################################
+# Feature Dictionary
+######################################################################
+
+
+class FeatDict(FeatStruct, dict):
+    """
+    A feature structure that acts like a Python dictionary.
+    I.e., a mapping from feature identifiers to feature values, where
+    a feature identifier can be a string or a ``Feature``; and where a
+    feature value can be either a basic value (such as a string or an
+    integer), or a nested feature structure.  A feature identifier for
+    a ``FeatDict`` is sometimes called a "feature name".
+
+    Two feature dicts are considered equal if they assign the same
+    values to all features, and have the same reentrances.
+
+    :see: ``FeatStruct`` for information about feature paths, reentrance,
+        cyclic feature structures, mutability, freezing, and hashing.
+    """
+
+    def __init__(self, features=None, **morefeatures):
+        """
+        Create a new feature dictionary, with the specified features.
+
+        :param features: The initial value for this feature
+            dictionary.  If ``features`` is a ``FeatStruct``, then its
+            features are copied (shallow copy).  If ``features`` is a
+            dict, then a feature is created for each item, mapping its
+            key to its value.  If ``features`` is a string, then it is
+            processed using ``FeatStructReader``.  If ``features`` is a list of
+            tuples ``(name, val)``, then a feature is created for each tuple.
+        :param morefeatures: Additional features for the new feature
+            dictionary.  If a feature is listed under both ``features`` and
+            ``morefeatures``, then the value from ``morefeatures`` will be
+            used.
+        """
+        if isinstance(features, str):
+            FeatStructReader().fromstring(features, self)
+            self.update(**morefeatures)
+        else:
+            # update() checks the types of features.
+            self.update(features, **morefeatures)
+
+    # ////////////////////////////////////////////////////////////
+    # { Dict methods
+    # ////////////////////////////////////////////////////////////
+    _INDEX_ERROR = "Expected feature name or path.  Got %r."
+
+    def __getitem__(self, name_or_path):
+        """If the feature with the given name or path exists, return
+        its value; otherwise, raise ``KeyError``."""
+        if isinstance(name_or_path, (str, Feature)):
+            return dict.__getitem__(self, name_or_path)
+        elif isinstance(name_or_path, tuple):
+            try:
+                val = self
+                for fid in name_or_path:
+                    if not isinstance(val, FeatStruct):
+                        raise KeyError  # path contains base value
+                    val = val[fid]
+                return val
+            except (KeyError, IndexError) as e:
+                raise KeyError(name_or_path) from e
+        else:
+            raise TypeError(self._INDEX_ERROR % name_or_path)
+
+    def get(self, name_or_path, default=None):
+        """If the feature with the given name or path exists, return its
+        value; otherwise, return ``default``."""
+        try:
+            return self[name_or_path]
+        except KeyError:
+            return default
+
+    def __contains__(self, name_or_path):
+        """Return true if a feature with the given name or path exists."""
+        try:
+            self[name_or_path]
+            return True
+        except KeyError:
+            return False
+
+    def has_key(self, name_or_path):
+        """Return true if a feature with the given name or path exists."""
+        return name_or_path in self
+
+    def __delitem__(self, name_or_path):
+        """If the feature with the given name or path exists, delete
+        its value; otherwise, raise ``KeyError``."""
+        if self._frozen:
+            raise ValueError(_FROZEN_ERROR)
+        if isinstance(name_or_path, (str, Feature)):
+            return dict.__delitem__(self, name_or_path)
+        elif isinstance(name_or_path, tuple):
+            if len(name_or_path) == 0:
+                raise ValueError("The path () can not be set")
+            else:
+                parent = self[name_or_path[:-1]]
+                if not isinstance(parent, FeatStruct):
+                    raise KeyError(name_or_path)  # path contains base value
+                del parent[name_or_path[-1]]
+        else:
+            raise TypeError(self._INDEX_ERROR % name_or_path)
+
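+    # An illustrative sketch of path-based indexing (hypothetical
+    # values; both forms address the same nested value):
+    #
+    #     fd = FeatDict('[a=[b=1]]')
+    #     fd[('a', 'b')]       # == 1, equivalent to fd['a']['b']
+    #     fd[('a', 'b')] = 2   # __setitem__ below follows the same path
+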
+    def __setitem__(self, name_or_path, value):
+        """Set the value for the feature with the given name or path
+        to ``value``.  If ``name_or_path`` is an invalid path, raise
+        ``KeyError``."""
+        if self._frozen:
+            raise ValueError(_FROZEN_ERROR)
+        if isinstance(name_or_path, (str, Feature)):
+            return dict.__setitem__(self, name_or_path, value)
+        elif isinstance(name_or_path, tuple):
+            if len(name_or_path) == 0:
+                raise ValueError("The path () can not be set")
+            else:
+                parent = self[name_or_path[:-1]]
+                if not isinstance(parent, FeatStruct):
+                    raise KeyError(name_or_path)  # path contains base value
+                parent[name_or_path[-1]] = value
+        else:
+            raise TypeError(self._INDEX_ERROR % name_or_path)
+
+    clear = _check_frozen(dict.clear)
+    pop = _check_frozen(dict.pop)
+    popitem = _check_frozen(dict.popitem)
+    setdefault = _check_frozen(dict.setdefault)
+
+    def update(self, features=None, **morefeatures):
+        if self._frozen:
+            raise ValueError(_FROZEN_ERROR)
+        if features is None:
+            items = ()
+        elif hasattr(features, "items") and callable(features.items):
+            items = features.items()
+        elif hasattr(features, "__iter__"):
+            items = features
+        else:
+            raise ValueError("Expected mapping or list of tuples")
+
+        for key, val in items:
+            if not isinstance(key, (str, Feature)):
+                raise TypeError("Feature names must be strings or Features")
+            self[key] = val
+        for key, val in morefeatures.items():
+            if not isinstance(key, (str, Feature)):
+                raise TypeError("Feature names must be strings or Features")
+            self[key] = val
+
+    ##////////////////////////////////////////////////////////////
+    # { Copying
+    ##////////////////////////////////////////////////////////////
+
+    def __deepcopy__(self, memo):
+        memo[id(self)] = selfcopy = self.__class__()
+        for (key, val) in self._items():
+            selfcopy[copy.deepcopy(key, memo)] = copy.deepcopy(val, memo)
+        return selfcopy
+
+    ##////////////////////////////////////////////////////////////
+    # { Uniform Accessor Methods
+    ##////////////////////////////////////////////////////////////
+
+    def _keys(self):
+        return self.keys()
+
+    def _values(self):
+        return self.values()
+
+    def _items(self):
+        return self.items()
+
+    ##////////////////////////////////////////////////////////////
+    # { String Representations
+    ##////////////////////////////////////////////////////////////
+
+    def __str__(self):
+        """
+        Display a multi-line representation of this feature dictionary
+        as an FVM (feature value matrix).
+        """
+        return "\n".join(self._str(self._find_reentrances({}), {}))
+
+    def _repr(self, reentrances, reentrance_ids):
+        segments = []
+        prefix = ""
+        suffix = ""
+
+        # If this is the first time we've seen a reentrant structure,
+        # then assign it a unique identifier.
+        if reentrances[id(self)]:
+            assert id(self) not in reentrance_ids
+            reentrance_ids[id(self)] = repr(len(reentrance_ids) + 1)
+
+        # sorting note: keys are unique strings, so we'll never fall
+        # through to comparing values.
+        for (fname, fval) in sorted(self.items()):
+            display = getattr(fname, "display", None)
+            if id(fval) in reentrance_ids:
+                segments.append(f"{fname}->({reentrance_ids[id(fval)]})")
+            elif (
+                display == "prefix" and not prefix and isinstance(fval, (Variable, str))
+            ):
+                prefix = "%s" % fval
+            elif display == "slash" and not suffix:
+                if isinstance(fval, Variable):
+                    suffix = "/%s" % fval.name
+                else:
+                    suffix = "/%s" % repr(fval)
+            elif isinstance(fval, Variable):
+                segments.append(f"{fname}={fval.name}")
+            elif fval is True:
+                segments.append("+%s" % fname)
+            elif fval is False:
+                segments.append("-%s" % fname)
+            elif isinstance(fval, Expression):
+                segments.append(f"{fname}=<{fval}>")
+            elif not isinstance(fval, FeatStruct):
+                segments.append(f"{fname}={repr(fval)}")
+            else:
+                fval_repr = fval._repr(reentrances, reentrance_ids)
+                segments.append(f"{fname}={fval_repr}")
+        # If it's reentrant, then add on an identifier tag.
+        if reentrances[id(self)]:
+            prefix = f"({reentrance_ids[id(self)]}){prefix}"
+        return "{}[{}]{}".format(prefix, ", ".join(segments), suffix)
+
+    def _str(self, reentrances, reentrance_ids):
+        """
+        :return: A list of lines composing a string representation of
+            this feature dictionary.
+        :param reentrances: A dictionary that maps from the ``id`` of
+            each feature value in self to a boolean value, indicating
+            whether that value is reentrant or not.
+        :param reentrance_ids: A dictionary mapping from each ``id``
+            of a feature value to a unique identifier.  This is modified
+            by ``repr``: the first time a reentrant feature value is
+            displayed, an identifier is added to ``reentrance_ids`` for
+            it.
+        """
+        # If this is the first time we've seen a reentrant structure,
+        # then tack on an id string.
+        if reentrances[id(self)]:
+            assert id(self) not in reentrance_ids
+            reentrance_ids[id(self)] = repr(len(reentrance_ids) + 1)
+
+        # Special case: empty feature dict.
+        if len(self) == 0:
+            if reentrances[id(self)]:
+                return ["(%s) []" % reentrance_ids[id(self)]]
+            else:
+                return ["[]"]
+
+        # What's the longest feature name?  Use this to align names.
+        maxfnamelen = max(len("%s" % k) for k in self.keys())
+
+        lines = []
+        # sorting note: keys are unique strings, so we'll never fall
+        # through to comparing values.
+        for (fname, fval) in sorted(self.items()):
+            fname = ("%s" % fname).ljust(maxfnamelen)
+            if isinstance(fval, Variable):
+                lines.append(f"{fname} = {fval.name}")
+
+            elif isinstance(fval, Expression):
+                lines.append(f"{fname} = <{fval}>")
+
+            elif isinstance(fval, FeatList):
+                fval_repr = fval._repr(reentrances, reentrance_ids)
+                lines.append(f"{fname} = {repr(fval_repr)}")
+
+            elif not isinstance(fval, FeatDict):
+                # It's not a nested feature structure -- just print it.
+                lines.append(f"{fname} = {repr(fval)}")
+
+            elif id(fval) in reentrance_ids:
+                # It's a feature structure we've seen before -- print
+                # the reentrance id.
+                lines.append(f"{fname} -> ({reentrance_ids[id(fval)]})")
+
+            else:
+                # It's a new feature structure.  Separate it from
+                # other values by a blank line.
+                if lines and lines[-1] != "":
+                    lines.append("")
+
+                # Recursively print the feature's value (fval).
+                fval_lines = fval._str(reentrances, reentrance_ids)
+
+                # Indent each line to make room for fname.
+                fval_lines = [(" " * (maxfnamelen + 3)) + l for l in fval_lines]
+
+                # Pick which line we'll display fname on, & splice it in.
+                nameline = (len(fval_lines) - 1) // 2
+                fval_lines[nameline] = (
+                    fname + " =" + fval_lines[nameline][maxfnamelen + 2 :]
+                )
+
+                # Add the feature structure to the output.
+                lines += fval_lines
+
+                # Separate FeatStructs by a blank line.
+                lines.append("")
+
+        # Get rid of any excess blank lines.
+        if lines[-1] == "":
+            lines.pop()
+
+        # Add brackets around everything.
+        maxlen = max(len(line) for line in lines)
+        lines = ["[ {}{} ]".format(line, " " * (maxlen - len(line))) for line in lines]
+
+        # If it's reentrant, then add on an identifier tag.
+        if reentrances[id(self)]:
+            idstr = "(%s) " % reentrance_ids[id(self)]
+            lines = [(" " * len(idstr)) + l for l in lines]
+            idline = (len(lines) - 1) // 2
+            lines[idline] = idstr + lines[idline][len(idstr) :]
+
+        return lines
+
+
+######################################################################
+# Feature List
+######################################################################
+
+
+class FeatList(FeatStruct, list):
+    """
+    A list of feature values, where each feature value is either a
+    basic value (such as a string or an integer), or a nested feature
+    structure.
+
+    Feature lists may contain reentrant feature values.  A "reentrant
+    feature value" is a single feature value that can be accessed via
+    multiple feature paths.  Feature lists may also be cyclic.
+
+    Two feature lists are considered equal if they assign the same
+    values to all features, and have the same reentrances.
+
+    :see: ``FeatStruct`` for information about feature paths, reentrance,
+        cyclic feature structures, mutability, freezing, and hashing.
+    """
+
+    def __init__(self, features=()):
+        """
+        Create a new feature list, with the specified features.
+
+        :param features: The initial list of features for this feature
+            list.  If ``features`` is a string, then it is parsed using
+            ``FeatStructReader``.  Otherwise, it should be a sequence
+            of basic values and nested feature structures.
+        """
+        if isinstance(features, str):
+            FeatStructReader().fromstring(features, self)
+        else:
+            list.__init__(self, features)
+
+    # ////////////////////////////////////////////////////////////
+    # { List methods
+    # ////////////////////////////////////////////////////////////
+    _INDEX_ERROR = "Expected int or feature path.  Got %r."
+
+    def __getitem__(self, name_or_path):
+        if isinstance(name_or_path, int):
+            return list.__getitem__(self, name_or_path)
+        elif isinstance(name_or_path, tuple):
+            try:
+                val = self
+                for fid in name_or_path:
+                    if not isinstance(val, FeatStruct):
+                        raise KeyError  # path contains base value
+                    val = val[fid]
+                return val
+            except (KeyError, IndexError) as e:
+                raise KeyError(name_or_path) from e
+        else:
+            raise TypeError(self._INDEX_ERROR % name_or_path)
+
+    def __delitem__(self, name_or_path):
+        """If the feature with the given name or path exists, delete
+        its value; otherwise, raise ``KeyError``."""
+        if self._frozen:
+            raise ValueError(_FROZEN_ERROR)
+        if isinstance(name_or_path, (int, slice)):
+            return list.__delitem__(self, name_or_path)
+        elif isinstance(name_or_path, tuple):
+            if len(name_or_path) == 0:
+                raise ValueError("The path () can not be set")
+            else:
+                parent = self[name_or_path[:-1]]
+                if not isinstance(parent, FeatStruct):
+                    raise KeyError(name_or_path)  # path contains base value
+                del parent[name_or_path[-1]]
+        else:
+            raise TypeError(self._INDEX_ERROR % name_or_path)
+
+    def __setitem__(self, name_or_path, value):
+        """Set the value for the feature with the given name or path
+        to ``value``.
If ``name_or_path`` is an invalid path, raise + ``KeyError``.""" + if self._frozen: + raise ValueError(_FROZEN_ERROR) + if isinstance(name_or_path, (int, slice)): + return list.__setitem__(self, name_or_path, value) + elif isinstance(name_or_path, tuple): + if len(name_or_path) == 0: + raise ValueError("The path () can not be set") + else: + parent = self[name_or_path[:-1]] + if not isinstance(parent, FeatStruct): + raise KeyError(name_or_path) # path contains base value + parent[name_or_path[-1]] = value + else: + raise TypeError(self._INDEX_ERROR % name_or_path) + + # __delslice__ = _check_frozen(list.__delslice__, ' ') + # __setslice__ = _check_frozen(list.__setslice__, ' ') + __iadd__ = _check_frozen(list.__iadd__) + __imul__ = _check_frozen(list.__imul__) + append = _check_frozen(list.append) + extend = _check_frozen(list.extend) + insert = _check_frozen(list.insert) + pop = _check_frozen(list.pop) + remove = _check_frozen(list.remove) + reverse = _check_frozen(list.reverse) + sort = _check_frozen(list.sort) + + ##//////////////////////////////////////////////////////////// + # { Copying + ##//////////////////////////////////////////////////////////// + + def __deepcopy__(self, memo): + memo[id(self)] = selfcopy = self.__class__() + selfcopy.extend(copy.deepcopy(fval, memo) for fval in self) + return selfcopy + + ##//////////////////////////////////////////////////////////// + # { Uniform Accessor Methods + ##//////////////////////////////////////////////////////////// + + def _keys(self): + return list(range(len(self))) + + def _values(self): + return self + + def _items(self): + return enumerate(self) + + ##//////////////////////////////////////////////////////////// + # { String Representations + ##//////////////////////////////////////////////////////////// + + # Special handling for: reentrances, variables, expressions. + def _repr(self, reentrances, reentrance_ids): + # If this is the first time we've seen a reentrant structure, + # then assign it a unique identifier. + if reentrances[id(self)]: + assert id(self) not in reentrance_ids + reentrance_ids[id(self)] = repr(len(reentrance_ids) + 1) + prefix = "(%s)" % reentrance_ids[id(self)] + else: + prefix = "" + + segments = [] + for fval in self: + if id(fval) in reentrance_ids: + segments.append("->(%s)" % reentrance_ids[id(fval)]) + elif isinstance(fval, Variable): + segments.append(fval.name) + elif isinstance(fval, Expression): + segments.append("%s" % fval) + elif isinstance(fval, FeatStruct): + segments.append(fval._repr(reentrances, reentrance_ids)) + else: + segments.append("%s" % repr(fval)) + + return "{}[{}]".format(prefix, ", ".join(segments)) + + +###################################################################### +# Variables & Bindings +###################################################################### + + +def substitute_bindings(fstruct, bindings, fs_class="default"): + """ + Return the feature structure that is obtained by replacing each + variable bound by ``bindings`` with its binding. If a variable is + aliased to a bound variable, then it will be replaced by that + variable's value. If a variable is aliased to an unbound + variable, then it will be replaced by that variable. + + :type bindings: dict(Variable -> any) + :param bindings: A dictionary mapping from variables to values. 
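+
+    A minimal usage sketch (the output shown is illustrative of the
+    ``repr`` format rather than guaranteed verbatim):
+
+    >>> from nltk.featstruct import FeatStruct, substitute_bindings
+    >>> from nltk.sem.logic import Variable
+    >>> substitute_bindings(FeatStruct('[a=?x]'), {Variable('?x'): 'b'})
+    [a='b']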
+ """ + if fs_class == "default": + fs_class = _default_fs_class(fstruct) + fstruct = copy.deepcopy(fstruct) + _substitute_bindings(fstruct, bindings, fs_class, set()) + return fstruct + + +def _substitute_bindings(fstruct, bindings, fs_class, visited): + # Visit each node only once: + if id(fstruct) in visited: + return + visited.add(id(fstruct)) + + if _is_mapping(fstruct): + items = fstruct.items() + elif _is_sequence(fstruct): + items = enumerate(fstruct) + else: + raise ValueError("Expected mapping or sequence") + for (fname, fval) in items: + while isinstance(fval, Variable) and fval in bindings: + fval = fstruct[fname] = bindings[fval] + if isinstance(fval, fs_class): + _substitute_bindings(fval, bindings, fs_class, visited) + elif isinstance(fval, SubstituteBindingsI): + fstruct[fname] = fval.substitute_bindings(bindings) + + +def retract_bindings(fstruct, bindings, fs_class="default"): + """ + Return the feature structure that is obtained by replacing each + feature structure value that is bound by ``bindings`` with the + variable that binds it. A feature structure value must be + identical to a bound value (i.e., have equal id) to be replaced. + + ``bindings`` is modified to point to this new feature structure, + rather than the original feature structure. Feature structure + values in ``bindings`` may be modified if they are contained in + ``fstruct``. + """ + if fs_class == "default": + fs_class = _default_fs_class(fstruct) + (fstruct, new_bindings) = copy.deepcopy((fstruct, bindings)) + bindings.update(new_bindings) + inv_bindings = {id(val): var for (var, val) in bindings.items()} + _retract_bindings(fstruct, inv_bindings, fs_class, set()) + return fstruct + + +def _retract_bindings(fstruct, inv_bindings, fs_class, visited): + # Visit each node only once: + if id(fstruct) in visited: + return + visited.add(id(fstruct)) + + if _is_mapping(fstruct): + items = fstruct.items() + elif _is_sequence(fstruct): + items = enumerate(fstruct) + else: + raise ValueError("Expected mapping or sequence") + for (fname, fval) in items: + if isinstance(fval, fs_class): + if id(fval) in inv_bindings: + fstruct[fname] = inv_bindings[id(fval)] + _retract_bindings(fval, inv_bindings, fs_class, visited) + + +def find_variables(fstruct, fs_class="default"): + """ + :return: The set of variables used by this feature structure. + :rtype: set(Variable) + """ + if fs_class == "default": + fs_class = _default_fs_class(fstruct) + return _variables(fstruct, set(), fs_class, set()) + + +def _variables(fstruct, vars, fs_class, visited): + # Visit each node only once: + if id(fstruct) in visited: + return + visited.add(id(fstruct)) + if _is_mapping(fstruct): + items = fstruct.items() + elif _is_sequence(fstruct): + items = enumerate(fstruct) + else: + raise ValueError("Expected mapping or sequence") + for (fname, fval) in items: + if isinstance(fval, Variable): + vars.add(fval) + elif isinstance(fval, fs_class): + _variables(fval, vars, fs_class, visited) + elif isinstance(fval, SubstituteBindingsI): + vars.update(fval.variables()) + return vars + + +def rename_variables( + fstruct, vars=None, used_vars=(), new_vars=None, fs_class="default" +): + """ + Return the feature structure that is obtained by replacing + any of this feature structure's variables that are in ``vars`` + with new variables. The names for these new variables will be + names that are not used by any variable in ``vars``, or in + ``used_vars``, or in this feature structure. 
+ + :type vars: set + :param vars: The set of variables that should be renamed. + If not specified, ``find_variables(fstruct)`` is used; i.e., all + variables will be given new names. + :type used_vars: set + :param used_vars: A set of variables whose names should not be + used by the new variables. + :type new_vars: dict(Variable -> Variable) + :param new_vars: A dictionary that is used to hold the mapping + from old variables to new variables. For each variable *v* + in this feature structure: + + - If ``new_vars`` maps *v* to *v'*, then *v* will be + replaced by *v'*. + - If ``new_vars`` does not contain *v*, but ``vars`` + does contain *v*, then a new entry will be added to + ``new_vars``, mapping *v* to the new variable that is used + to replace it. + + To consistently rename the variables in a set of feature + structures, simply apply rename_variables to each one, using + the same dictionary: + + >>> from nltk.featstruct import FeatStruct + >>> fstruct1 = FeatStruct('[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]') + >>> fstruct2 = FeatStruct('[subj=[agr=[number=?z,gender=?y]], obj=[agr=[number=?z,gender=?y]]]') + >>> new_vars = {} # Maps old vars to alpha-renamed vars + >>> fstruct1.rename_variables(new_vars=new_vars) + [obj=[agr=[gender=?y2]], subj=[agr=[gender=?y2]]] + >>> fstruct2.rename_variables(new_vars=new_vars) + [obj=[agr=[gender=?y2, number=?z2]], subj=[agr=[gender=?y2, number=?z2]]] + + If new_vars is not specified, then an empty dictionary is used. + """ + if fs_class == "default": + fs_class = _default_fs_class(fstruct) + + # Default values: + if new_vars is None: + new_vars = {} + if vars is None: + vars = find_variables(fstruct, fs_class) + else: + vars = set(vars) + + # Add our own variables to used_vars. + used_vars = find_variables(fstruct, fs_class).union(used_vars) + + # Copy ourselves, and rename variables in the copy. + return _rename_variables( + copy.deepcopy(fstruct), vars, used_vars, new_vars, fs_class, set() + ) + + +def _rename_variables(fstruct, vars, used_vars, new_vars, fs_class, visited): + if id(fstruct) in visited: + return + visited.add(id(fstruct)) + if _is_mapping(fstruct): + items = fstruct.items() + elif _is_sequence(fstruct): + items = enumerate(fstruct) + else: + raise ValueError("Expected mapping or sequence") + for (fname, fval) in items: + if isinstance(fval, Variable): + # If it's in new_vars, then rebind it. + if fval in new_vars: + fstruct[fname] = new_vars[fval] + # If it's in vars, pick a new name for it. + elif fval in vars: + new_vars[fval] = _rename_variable(fval, used_vars) + fstruct[fname] = new_vars[fval] + used_vars.add(new_vars[fval]) + elif isinstance(fval, fs_class): + _rename_variables(fval, vars, used_vars, new_vars, fs_class, visited) + elif isinstance(fval, SubstituteBindingsI): + # Pick new names for any variables in `vars` + for var in fval.variables(): + if var in vars and var not in new_vars: + new_vars[var] = _rename_variable(var, used_vars) + used_vars.add(new_vars[var]) + # Replace all variables in `new_vars`. + fstruct[fname] = fval.substitute_bindings(new_vars) + return fstruct + + +def _rename_variable(var, used_vars): + name, n = re.sub(r"\d+$", "", var.name), 2 + if not name: + name = "?" + while Variable(f"{name}{n}") in used_vars: + n += 1 + return Variable(f"{name}{n}") + + +def remove_variables(fstruct, fs_class="default"): + """ + :rtype: FeatStruct + :return: The feature structure that is obtained by deleting + all features whose values are ``Variables``. 
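+
+    A minimal sketch (illustrative output):
+
+    >>> from nltk.featstruct import FeatStruct, remove_variables
+    >>> remove_variables(FeatStruct('[a=?x, b=1]'))
+    [b=1]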
+ """ + if fs_class == "default": + fs_class = _default_fs_class(fstruct) + return _remove_variables(copy.deepcopy(fstruct), fs_class, set()) + + +def _remove_variables(fstruct, fs_class, visited): + if id(fstruct) in visited: + return + visited.add(id(fstruct)) + + if _is_mapping(fstruct): + items = list(fstruct.items()) + elif _is_sequence(fstruct): + items = list(enumerate(fstruct)) + else: + raise ValueError("Expected mapping or sequence") + + for (fname, fval) in items: + if isinstance(fval, Variable): + del fstruct[fname] + elif isinstance(fval, fs_class): + _remove_variables(fval, fs_class, visited) + return fstruct + + +###################################################################### +# Unification +###################################################################### + + +class _UnificationFailure: + def __repr__(self): + return "nltk.featstruct.UnificationFailure" + + +UnificationFailure = _UnificationFailure() +"""A unique value used to indicate unification failure. It can be + returned by ``Feature.unify_base_values()`` or by custom ``fail()`` + functions to indicate that unificaiton should fail.""" + + +# The basic unification algorithm: +# 1. Make copies of self and other (preserving reentrance) +# 2. Destructively unify self and other +# 3. Apply forward pointers, to preserve reentrance. +# 4. Replace bound variables with their values. +def unify( + fstruct1, + fstruct2, + bindings=None, + trace=False, + fail=None, + rename_vars=True, + fs_class="default", +): + """ + Unify ``fstruct1`` with ``fstruct2``, and return the resulting feature + structure. This unified feature structure is the minimal + feature structure that contains all feature value assignments from both + ``fstruct1`` and ``fstruct2``, and that preserves all reentrancies. + + If no such feature structure exists (because ``fstruct1`` and + ``fstruct2`` specify incompatible values for some feature), then + unification fails, and ``unify`` returns None. + + Bound variables are replaced by their values. Aliased + variables are replaced by their representative variable + (if unbound) or the value of their representative variable + (if bound). I.e., if variable *v* is in ``bindings``, + then *v* is replaced by ``bindings[v]``. This will + be repeated until the variable is replaced by an unbound + variable or a non-variable value. + + Unbound variables are bound when they are unified with + values; and aliased when they are unified with variables. + I.e., if variable *v* is not in ``bindings``, and is + unified with a variable or value *x*, then + ``bindings[v]`` is set to *x*. + + If ``bindings`` is unspecified, then all variables are + assumed to be unbound. I.e., ``bindings`` defaults to an + empty dict. + + >>> from nltk.featstruct import FeatStruct + >>> FeatStruct('[a=?x]').unify(FeatStruct('[b=?x]')) + [a=?x, b=?x2] + + :type bindings: dict(Variable -> any) + :param bindings: A set of variable bindings to be used and + updated during unification. + :type trace: bool + :param trace: If true, generate trace output. + :type rename_vars: bool + :param rename_vars: If True, then rename any variables in + ``fstruct2`` that are also used in ``fstruct1``, in order to + avoid collisions on variable names. + """ + # Decide which class(es) will be treated as feature structures, + # for the purposes of unification. 
+ if fs_class == "default": + fs_class = _default_fs_class(fstruct1) + if _default_fs_class(fstruct2) != fs_class: + raise ValueError( + "Mixing FeatStruct objects with Python " + "dicts and lists is not supported." + ) + assert isinstance(fstruct1, fs_class) + assert isinstance(fstruct2, fs_class) + + # If bindings are unspecified, use an empty set of bindings. + user_bindings = bindings is not None + if bindings is None: + bindings = {} + + # Make copies of fstruct1 and fstruct2 (since the unification + # algorithm is destructive). Do it all at once, to preserve + # reentrance links between fstruct1 and fstruct2. Copy bindings + # as well, in case there are any bound vars that contain parts + # of fstruct1 or fstruct2. + (fstruct1copy, fstruct2copy, bindings_copy) = copy.deepcopy( + (fstruct1, fstruct2, bindings) + ) + + # Copy the bindings back to the original bindings dict. + bindings.update(bindings_copy) + + if rename_vars: + vars1 = find_variables(fstruct1copy, fs_class) + vars2 = find_variables(fstruct2copy, fs_class) + _rename_variables(fstruct2copy, vars1, vars2, {}, fs_class, set()) + + # Do the actual unification. If it fails, return None. + forward = {} + if trace: + _trace_unify_start((), fstruct1copy, fstruct2copy) + try: + result = _destructively_unify( + fstruct1copy, fstruct2copy, bindings, forward, trace, fail, fs_class, () + ) + except _UnificationFailureError: + return None + + # _destructively_unify might return UnificationFailure, e.g. if we + # tried to unify a mapping with a sequence. + if result is UnificationFailure: + if fail is None: + return None + else: + return fail(fstruct1copy, fstruct2copy, ()) + + # Replace any feature structure that has a forward pointer + # with the target of its forward pointer. + result = _apply_forwards(result, forward, fs_class, set()) + if user_bindings: + _apply_forwards_to_bindings(forward, bindings) + + # Replace bound vars with values. + _resolve_aliases(bindings) + _substitute_bindings(result, bindings, fs_class, set()) + + # Return the result. + if trace: + _trace_unify_succeed((), result) + if trace: + _trace_bindings((), bindings) + return result + + +class _UnificationFailureError(Exception): + """An exception that is used by ``_destructively_unify`` to abort + unification when a failure is encountered.""" + + +def _destructively_unify( + fstruct1, fstruct2, bindings, forward, trace, fail, fs_class, path +): + """ + Attempt to unify ``fstruct1`` and ``fstruct2`` by modifying them + in-place. If the unification succeeds, then ``fstruct1`` will + contain the unified value, the value of ``fstruct2`` is undefined, + and forward[id(fstruct2)] is set to fstruct1. If the unification + fails, then a _UnificationFailureError is raised, and the + values of ``fstruct1`` and ``fstruct2`` are undefined. + + :param bindings: A dictionary mapping variables to values. + :param forward: A dictionary mapping feature structures ids + to replacement structures. When two feature structures + are merged, a mapping from one to the other will be added + to the forward dictionary; and changes will be made only + to the target of the forward dictionary. + ``_destructively_unify`` will always 'follow' any links + in the forward dictionary for fstruct1 and fstruct2 before + actually unifying them. + :param trace: If true, generate trace output + :param path: The feature path that led us to this unification + step. Used for trace output. + """ + # If fstruct1 is already identical to fstruct2, we're done. 
+    # Note: this, together with the forward pointers, ensures
+    # that unification will terminate even for cyclic structures.
+    if fstruct1 is fstruct2:
+        if trace:
+            _trace_unify_identity(path, fstruct1)
+        return fstruct1
+
+    # Set fstruct2's forward pointer to point to fstruct1; this makes
+    # fstruct1 the canonical copy for fstruct2. Note that we need to
+    # do this before we recurse into any child structures, in case
+    # they're cyclic.
+    forward[id(fstruct2)] = fstruct1
+
+    # Unifying two mappings:
+    if _is_mapping(fstruct1) and _is_mapping(fstruct2):
+        for fname in fstruct1:
+            if getattr(fname, "default", None) is not None:
+                fstruct2.setdefault(fname, fname.default)
+        for fname in fstruct2:
+            if getattr(fname, "default", None) is not None:
+                fstruct1.setdefault(fname, fname.default)
+
+        # Unify any values that are defined in both fstruct1 and
+        # fstruct2. Copy any values that are defined in fstruct2 but
+        # not in fstruct1 to fstruct1. Note: sorting fstruct2's
+        # features isn't actually necessary; but we do it to give
+        # deterministic behavior, e.g. for tracing.
+        for fname, fval2 in sorted(fstruct2.items()):
+            if fname in fstruct1:
+                fstruct1[fname] = _unify_feature_values(
+                    fname,
+                    fstruct1[fname],
+                    fval2,
+                    bindings,
+                    forward,
+                    trace,
+                    fail,
+                    fs_class,
+                    path + (fname,),
+                )
+            else:
+                fstruct1[fname] = fval2
+
+        return fstruct1  # Contains the unified value.
+
+    # Unifying two sequences:
+    elif _is_sequence(fstruct1) and _is_sequence(fstruct2):
+        # If the lengths don't match, fail.
+        if len(fstruct1) != len(fstruct2):
+            return UnificationFailure
+
+        # Unify corresponding values in fstruct1 and fstruct2.
+        for findex in range(len(fstruct1)):
+            fstruct1[findex] = _unify_feature_values(
+                findex,
+                fstruct1[findex],
+                fstruct2[findex],
+                bindings,
+                forward,
+                trace,
+                fail,
+                fs_class,
+                path + (findex,),
+            )
+
+        return fstruct1  # Contains the unified value.
+
+    # Unifying sequence & mapping: fail. The failure function
+    # doesn't get a chance to recover in this case.
+    elif (_is_sequence(fstruct1) or _is_mapping(fstruct1)) and (
+        _is_sequence(fstruct2) or _is_mapping(fstruct2)
+    ):
+        return UnificationFailure
+
+    # Unifying anything else: not allowed!
+    raise TypeError("Expected mappings or sequences")
+
+
+def _unify_feature_values(
+    fname, fval1, fval2, bindings, forward, trace, fail, fs_class, fpath
+):
+    """
+    Attempt to unify ``fval1`` and ``fval2``, and return the
+    resulting unified value. The method of unification will depend on
+    the types of ``fval1`` and ``fval2``:
+
+    1. If they're both feature structures, then destructively
+       unify them (see ``_destructively_unify()``).
+    2. If they're both unbound variables, then alias one variable
+       to the other (by setting bindings[v2]=v1).
+    3. If one is an unbound variable, and the other is a value,
+       then bind the unbound variable to the value.
+    4. If one is a feature structure, and the other is a base value,
+       then fail.
+    5. If they're both base values, then unify them. By default,
+       this will succeed if they are equal, and fail otherwise.
+    """
+    if trace:
+        _trace_unify_start(fpath, fval1, fval2)
+
+    # Look up the "canonical" copy of fval1 and fval2
+    while id(fval1) in forward:
+        fval1 = forward[id(fval1)]
+    while id(fval2) in forward:
+        fval2 = forward[id(fval2)]
+
+    # If fval1 or fval2 is a bound variable, then
+    # replace it by the variable's bound value. This
+    # includes aliased variables, which are encoded as
+    # variables bound to other variables.
+    fvar1 = fvar2 = None
+    while isinstance(fval1, Variable) and fval1 in bindings:
+        fvar1 = fval1
+        fval1 = bindings[fval1]
+    while isinstance(fval2, Variable) and fval2 in bindings:
+        fvar2 = fval2
+        fval2 = bindings[fval2]
+
+    # Case 1: Two feature structures (recursive case)
+    if isinstance(fval1, fs_class) and isinstance(fval2, fs_class):
+        result = _destructively_unify(
+            fval1, fval2, bindings, forward, trace, fail, fs_class, fpath
+        )
+
+    # Case 2: Two unbound variables (create alias)
+    elif isinstance(fval1, Variable) and isinstance(fval2, Variable):
+        if fval1 != fval2:
+            bindings[fval2] = fval1
+        result = fval1
+
+    # Case 3: An unbound variable and a value (bind)
+    elif isinstance(fval1, Variable):
+        bindings[fval1] = fval2
+        result = fval1
+    elif isinstance(fval2, Variable):
+        bindings[fval2] = fval1
+        result = fval2
+
+    # Case 4: A feature structure & a base value (fail)
+    elif isinstance(fval1, fs_class) or isinstance(fval2, fs_class):
+        result = UnificationFailure
+
+    # Case 5: Two base values
+    else:
+        # Case 5a: Feature defines a custom unification method for base values
+        if isinstance(fname, Feature):
+            result = fname.unify_base_values(fval1, fval2, bindings)
+        # Case 5b: Feature value defines custom unification method
+        elif isinstance(fval1, CustomFeatureValue):
+            result = fval1.unify(fval2)
+            # Sanity check: unify value should be symmetric
+            if isinstance(fval2, CustomFeatureValue) and result != fval2.unify(fval1):
+                raise AssertionError(
+                    "CustomFeatureValue objects %r and %r disagree "
+                    "about unification value: %r vs. %r"
+                    % (fval1, fval2, result, fval2.unify(fval1))
+                )
+        elif isinstance(fval2, CustomFeatureValue):
+            result = fval2.unify(fval1)
+        # Case 5c: Simple values -- check if they're equal.
+        else:
+            if fval1 == fval2:
+                result = fval1
+            else:
+                result = UnificationFailure
+
+    # If either value was a bound variable, then update the
+    # bindings. (This is really only necessary if fname is a
+    # Feature or if either value is a CustomFeatureValue.)
+    if result is not UnificationFailure:
+        if fvar1 is not None:
+            bindings[fvar1] = result
+            result = fvar1
+        if fvar2 is not None and fvar2 != fvar1:
+            bindings[fvar2] = result
+            result = fvar2
+
+    # If unification failed, call the failure function; it
+    # might decide to continue anyway.
+    if result is UnificationFailure:
+        if fail is not None:
+            result = fail(fval1, fval2, fpath)
+        if trace:
+            _trace_unify_fail(fpath[:-1], result)
+        if result is UnificationFailure:
+            raise _UnificationFailureError
+
+    # Normalize the result.
+    if isinstance(result, fs_class):
+        result = _apply_forwards(result, forward, fs_class, set())
+
+    if trace:
+        _trace_unify_succeed(fpath, result)
+    if trace and isinstance(result, fs_class):
+        _trace_bindings(fpath, bindings)
+
+    return result
+
+
+def _apply_forwards_to_bindings(forward, bindings):
+    """
+    Replace any feature structure that has a forward pointer with
+    the target of its forward pointer (to preserve reentrancy).
+    """
+    for (var, value) in bindings.items():
+        while id(value) in forward:
+            value = forward[id(value)]
+        bindings[var] = value
+
+
+def _apply_forwards(fstruct, forward, fs_class, visited):
+    """
+    Replace any feature structure that has a forward pointer with
+    the target of its forward pointer (to preserve reentrancy).
+ """ + # Follow our own forwards pointers (if any) + while id(fstruct) in forward: + fstruct = forward[id(fstruct)] + + # Visit each node only once: + if id(fstruct) in visited: + return + visited.add(id(fstruct)) + + if _is_mapping(fstruct): + items = fstruct.items() + elif _is_sequence(fstruct): + items = enumerate(fstruct) + else: + raise ValueError("Expected mapping or sequence") + for fname, fval in items: + if isinstance(fval, fs_class): + # Replace w/ forwarded value. + while id(fval) in forward: + fval = forward[id(fval)] + fstruct[fname] = fval + # Recurse to child. + _apply_forwards(fval, forward, fs_class, visited) + + return fstruct + + +def _resolve_aliases(bindings): + """ + Replace any bound aliased vars with their binding; and replace + any unbound aliased vars with their representative var. + """ + for (var, value) in bindings.items(): + while isinstance(value, Variable) and value in bindings: + value = bindings[var] = bindings[value] + + +def _trace_unify_start(path, fval1, fval2): + if path == (): + print("\nUnification trace:") + else: + fullname = ".".join("%s" % n for n in path) + print(" " + "| " * (len(path) - 1) + "|") + print(" " + "| " * (len(path) - 1) + "| Unify feature: %s" % fullname) + print(" " + "| " * len(path) + " / " + _trace_valrepr(fval1)) + print(" " + "| " * len(path) + "|\\ " + _trace_valrepr(fval2)) + + +def _trace_unify_identity(path, fval1): + print(" " + "| " * len(path) + "|") + print(" " + "| " * len(path) + "| (identical objects)") + print(" " + "| " * len(path) + "|") + print(" " + "| " * len(path) + "+-->" + repr(fval1)) + + +def _trace_unify_fail(path, result): + if result is UnificationFailure: + resume = "" + else: + resume = " (nonfatal)" + print(" " + "| " * len(path) + "| |") + print(" " + "X " * len(path) + "X X <-- FAIL" + resume) + + +def _trace_unify_succeed(path, fval1): + # Print the result. + print(" " + "| " * len(path) + "|") + print(" " + "| " * len(path) + "+-->" + repr(fval1)) + + +def _trace_bindings(path, bindings): + # Print the bindings (if any). + if len(bindings) > 0: + binditems = sorted(bindings.items(), key=lambda v: v[0].name) + bindstr = "{%s}" % ", ".join( + f"{var}: {_trace_valrepr(val)}" for (var, val) in binditems + ) + print(" " + "| " * len(path) + " Bindings: " + bindstr) + + +def _trace_valrepr(val): + if isinstance(val, Variable): + return "%s" % val + else: + return "%s" % repr(val) + + +def subsumes(fstruct1, fstruct2): + """ + Return True if ``fstruct1`` subsumes ``fstruct2``. I.e., return + true if unifying ``fstruct1`` with ``fstruct2`` would result in a + feature structure equal to ``fstruct2.`` + + :rtype: bool + """ + return fstruct2 == unify(fstruct1, fstruct2) + + +def conflicts(fstruct1, fstruct2, trace=0): + """ + Return a list of the feature paths of all features which are + assigned incompatible values by ``fstruct1`` and ``fstruct2``. 
+
+    :rtype: list(tuple)
+    """
+    conflict_list = []
+
+    def add_conflict(fval1, fval2, path):
+        conflict_list.append(path)
+        return fval1
+
+    unify(fstruct1, fstruct2, fail=add_conflict, trace=trace)
+    return conflict_list
+
+
+######################################################################
+# Helper Functions
+######################################################################
+
+
+def _is_mapping(v):
+    return hasattr(v, "__contains__") and hasattr(v, "keys")
+
+
+def _is_sequence(v):
+    return hasattr(v, "__iter__") and hasattr(v, "__len__") and not isinstance(v, str)
+
+
+def _default_fs_class(obj):
+    if isinstance(obj, FeatStruct):
+        return FeatStruct
+    if isinstance(obj, (dict, list)):
+        return (dict, list)
+    else:
+        raise ValueError(
+            "To unify objects of type %s, you must specify "
+            "fs_class explicitly." % obj.__class__.__name__
+        )
+
+
+######################################################################
+# FeatureValueSet & FeatureValueTuple
+######################################################################
+
+
+class SubstituteBindingsSequence(SubstituteBindingsI):
+    """
+    A mixin class for sequence classes that distributes variables() and
+    substitute_bindings() over the object's elements.
+    """
+
+    def variables(self):
+        return [elt for elt in self if isinstance(elt, Variable)] + sum(
+            (
+                list(elt.variables())
+                for elt in self
+                if isinstance(elt, SubstituteBindingsI)
+            ),
+            [],
+        )
+
+    def substitute_bindings(self, bindings):
+        return self.__class__([self.subst(v, bindings) for v in self])
+
+    def subst(self, v, bindings):
+        if isinstance(v, SubstituteBindingsI):
+            return v.substitute_bindings(bindings)
+        else:
+            return bindings.get(v, v)
+
+
+class FeatureValueTuple(SubstituteBindingsSequence, tuple):
+    """
+    A base feature value that is a tuple of other base feature values.
+    FeatureValueTuple implements ``SubstituteBindingsI``, so any
+    variable substitutions will be propagated to the elements
+    contained by the tuple. A ``FeatureValueTuple`` is immutable.
+    """
+
+    def __repr__(self):  # [xx] really use %s here?
+        if len(self) == 0:
+            return "()"
+        return "(%s)" % ", ".join(f"{b}" for b in self)
+
+
+class FeatureValueSet(SubstituteBindingsSequence, frozenset):
+    """
+    A base feature value that is a set of other base feature values.
+    FeatureValueSet implements ``SubstituteBindingsI``, so any
+    variable substitutions will be propagated to the elements
+    contained by the set. A ``FeatureValueSet`` is immutable.
+    """
+
+    def __repr__(self):  # [xx] really use %s here?
+        if len(self) == 0:
+            return "{/}"  # distinguish from dict.
+        # n.b., we sort the string reprs of our elements, to ensure
+        # that our own repr is deterministic.
+        return "{%s}" % ", ".join(sorted(f"{b}" for b in self))
+
+    __str__ = __repr__
+
+
+class FeatureValueUnion(SubstituteBindingsSequence, frozenset):
+    """
+    A base feature value that represents the union of two or more
+    ``FeatureValueSet`` or ``Variable``.
+    """
+
+    def __new__(cls, values):
+        # If values contains FeatureValueUnions, then collapse them.
+        values = _flatten(values, FeatureValueUnion)
+
+        # If the resulting list contains no variables, then
+        # use a simple FeatureValueSet instead.
+        if sum(isinstance(v, Variable) for v in values) == 0:
+            values = _flatten(values, FeatureValueSet)
+            return FeatureValueSet(values)
+
+        # If we contain a single variable, return that variable.
+        if len(values) == 1:
+            return list(values)[0]
+
+        # Otherwise, build the FeatureValueUnion.
+        return frozenset.__new__(cls, values)
+
+    def __repr__(self):
+        # n.b., we sort the string reprs of our elements, to ensure
+        # that our own repr is deterministic. also, note that len(self)
+        # is guaranteed to be 2 or more.
+        return "{%s}" % "+".join(sorted(f"{b}" for b in self))
+
+
+class FeatureValueConcat(SubstituteBindingsSequence, tuple):
+    """
+    A base feature value that represents the concatenation of two or
+    more ``FeatureValueTuple`` or ``Variable``.
+    """
+
+    def __new__(cls, values):
+        # If values contains FeatureValueConcats, then collapse them.
+        values = _flatten(values, FeatureValueConcat)
+
+        # If the resulting list contains no variables, then
+        # use a simple FeatureValueTuple instead.
+        if sum(isinstance(v, Variable) for v in values) == 0:
+            values = _flatten(values, FeatureValueTuple)
+            return FeatureValueTuple(values)
+
+        # If we contain a single variable, return that variable.
+        if len(values) == 1:
+            return list(values)[0]
+
+        # Otherwise, build the FeatureValueConcat.
+        return tuple.__new__(cls, values)
+
+    def __repr__(self):
+        # n.b.: len(self) is guaranteed to be 2 or more.
+        return "(%s)" % "+".join(f"{b}" for b in self)
+
+
+def _flatten(lst, cls):
+    """
+    Helper function -- return a copy of ``lst``, with all elements of
+    type ``cls`` spliced in rather than appended in.
+    """
+    result = []
+    for elt in lst:
+        if isinstance(elt, cls):
+            result.extend(elt)
+        else:
+            result.append(elt)
+    return result
+
+
+######################################################################
+# Specialized Features
+######################################################################
+
+
+@total_ordering
+class Feature:
+    """
+    A feature identifier that's specialized to put additional
+    constraints, default values, etc.
+    """
+
+    def __init__(self, name, default=None, display=None):
+        assert display in (None, "prefix", "slash")
+
+        self._name = name  # [xx] rename to .identifier?
+        self._default = default  # [xx] not implemented yet.
+        self._display = display
+
+        if self._display == "prefix":
+            self._sortkey = (-1, self._name)
+        elif self._display == "slash":
+            self._sortkey = (1, self._name)
+        else:
+            self._sortkey = (0, self._name)
+
+    @property
+    def name(self):
+        """The name of this feature."""
+        return self._name
+
+    @property
+    def default(self):
+        """Default value for this feature."""
+        return self._default
+
+    @property
+    def display(self):
+        """Custom display location: can be prefix, or slash."""
+        return self._display
+
+    def __repr__(self):
+        return "*%s*" % self.name
+
+    def __lt__(self, other):
+        if isinstance(other, str):
+            return True
+        if not isinstance(other, Feature):
+            raise_unorderable_types("<", self, other)
+        return self._sortkey < other._sortkey
+
+    def __eq__(self, other):
+        return type(self) == type(other) and self._name == other._name
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __hash__(self):
+        return hash(self._name)
+
+    # ////////////////////////////////////////////////////////////
+    # These can be overridden by subclasses:
+    # ////////////////////////////////////////////////////////////
+
+    def read_value(self, s, position, reentrances, parser):
+        return parser.read_value(s, position, reentrances)
+
+    def unify_base_values(self, fval1, fval2, bindings):
+        """
+        If possible, return a single value. If not, return
+        the value ``UnificationFailure``.
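+
+        A sketch of the default behaviour (subclasses may override this;
+        the feature name ``'case'`` is just an illustration):
+
+        >>> from nltk.featstruct import Feature, UnificationFailure
+        >>> f = Feature('case')
+        >>> f.unify_base_values('nom', 'nom', {})
+        'nom'
+        >>> f.unify_base_values('nom', 'acc', {}) is UnificationFailure
+        True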
+ """ + if fval1 == fval2: + return fval1 + else: + return UnificationFailure + + +class SlashFeature(Feature): + def read_value(self, s, position, reentrances, parser): + return parser.read_partial(s, position, reentrances) + + +class RangeFeature(Feature): + RANGE_RE = re.compile(r"(-?\d+):(-?\d+)") + + def read_value(self, s, position, reentrances, parser): + m = self.RANGE_RE.match(s, position) + if not m: + raise ValueError("range", position) + return (int(m.group(1)), int(m.group(2))), m.end() + + def unify_base_values(self, fval1, fval2, bindings): + if fval1 is None: + return fval2 + if fval2 is None: + return fval1 + rng = max(fval1[0], fval2[0]), min(fval1[1], fval2[1]) + if rng[1] < rng[0]: + return UnificationFailure + return rng + + +SLASH = SlashFeature("slash", default=False, display="slash") +TYPE = Feature("type", display="prefix") + + +###################################################################### +# Specialized Feature Values +###################################################################### + + +@total_ordering +class CustomFeatureValue: + """ + An abstract base class for base values that define a custom + unification method. The custom unification method of + ``CustomFeatureValue`` will be used during unification if: + + - The ``CustomFeatureValue`` is unified with another base value. + - The ``CustomFeatureValue`` is not the value of a customized + ``Feature`` (which defines its own unification method). + + If two ``CustomFeatureValue`` objects are unified with one another + during feature structure unification, then the unified base values + they return *must* be equal; otherwise, an ``AssertionError`` will + be raised. + + Subclasses must define ``unify()``, ``__eq__()`` and ``__lt__()``. + Subclasses may also wish to define ``__hash__()``. + """ + + def unify(self, other): + """ + If this base value unifies with ``other``, then return the + unified value. Otherwise, return ``UnificationFailure``. + """ + raise NotImplementedError("abstract base class") + + def __eq__(self, other): + return NotImplemented + + def __ne__(self, other): + return not self == other + + def __lt__(self, other): + return NotImplemented + + def __hash__(self): + raise TypeError("%s objects or unhashable" % self.__class__.__name__) + + +###################################################################### +# Feature Structure Reader +###################################################################### + + +class FeatStructReader: + def __init__( + self, + features=(SLASH, TYPE), + fdict_class=FeatStruct, + flist_class=FeatList, + logic_parser=None, + ): + self._features = {f.name: f for f in features} + self._fdict_class = fdict_class + self._flist_class = flist_class + self._prefix_feature = None + self._slash_feature = None + for feature in features: + if feature.display == "slash": + if self._slash_feature: + raise ValueError("Multiple features w/ display=slash") + self._slash_feature = feature + if feature.display == "prefix": + if self._prefix_feature: + raise ValueError("Multiple features w/ display=prefix") + self._prefix_feature = feature + self._features_with_defaults = [ + feature for feature in features if feature.default is not None + ] + if logic_parser is None: + logic_parser = LogicParser() + self._logic_parser = logic_parser + + def fromstring(self, s, fstruct=None): + """ + Convert a string representation of a feature structure (as + displayed by repr) into a ``FeatStruct``. 
This process + imposes the following restrictions on the string + representation: + + - Feature names cannot contain any of the following: + whitespace, parentheses, quote marks, equals signs, + dashes, commas, and square brackets. Feature names may + not begin with plus signs or minus signs. + - Only the following basic feature value are supported: + strings, integers, variables, None, and unquoted + alphanumeric strings. + - For reentrant values, the first mention must specify + a reentrance identifier and a value; and any subsequent + mentions must use arrows (``'->'``) to reference the + reentrance identifier. + """ + s = s.strip() + value, position = self.read_partial(s, 0, {}, fstruct) + if position != len(s): + self._error(s, "end of string", position) + return value + + _START_FSTRUCT_RE = re.compile(r"\s*(?:\((\d+)\)\s*)?(\??[\w-]+)?(\[)") + _END_FSTRUCT_RE = re.compile(r"\s*]\s*") + _SLASH_RE = re.compile(r"/") + _FEATURE_NAME_RE = re.compile(r'\s*([+-]?)([^\s\(\)<>"\'\-=\[\],]+)\s*') + _REENTRANCE_RE = re.compile(r"\s*->\s*") + _TARGET_RE = re.compile(r"\s*\((\d+)\)\s*") + _ASSIGN_RE = re.compile(r"\s*=\s*") + _COMMA_RE = re.compile(r"\s*,\s*") + _BARE_PREFIX_RE = re.compile(r"\s*(?:\((\d+)\)\s*)?(\??[\w-]+\s*)()") + # This one is used to distinguish fdicts from flists: + _START_FDICT_RE = re.compile( + r"(%s)|(%s\s*(%s\s*(=|->)|[+-]%s|\]))" + % ( + _BARE_PREFIX_RE.pattern, + _START_FSTRUCT_RE.pattern, + _FEATURE_NAME_RE.pattern, + _FEATURE_NAME_RE.pattern, + ) + ) + + def read_partial(self, s, position=0, reentrances=None, fstruct=None): + """ + Helper function that reads in a feature structure. + + :param s: The string to read. + :param position: The position in the string to start parsing. + :param reentrances: A dictionary from reentrance ids to values. + Defaults to an empty dictionary. + :return: A tuple (val, pos) of the feature structure created by + parsing and the position where the parsed feature structure ends. + :rtype: bool + """ + if reentrances is None: + reentrances = {} + try: + return self._read_partial(s, position, reentrances, fstruct) + except ValueError as e: + if len(e.args) != 2: + raise + self._error(s, *e.args) + + def _read_partial(self, s, position, reentrances, fstruct=None): + # Create the new feature structure + if fstruct is None: + if self._START_FDICT_RE.match(s, position): + fstruct = self._fdict_class() + else: + fstruct = self._flist_class() + + # Read up to the open bracket. + match = self._START_FSTRUCT_RE.match(s, position) + if not match: + match = self._BARE_PREFIX_RE.match(s, position) + if not match: + raise ValueError("open bracket or identifier", position) + position = match.end() + + # If there as an identifier, record it. + if match.group(1): + identifier = match.group(1) + if identifier in reentrances: + raise ValueError("new identifier", match.start(1)) + reentrances[identifier] = fstruct + + if isinstance(fstruct, FeatDict): + fstruct.clear() + return self._read_partial_featdict(s, position, match, reentrances, fstruct) + else: + del fstruct[:] + return self._read_partial_featlist(s, position, match, reentrances, fstruct) + + def _read_partial_featlist(self, s, position, match, reentrances, fstruct): + # Prefix features are not allowed: + if match.group(2): + raise ValueError("open bracket") + # Bare prefixes are not allowed: + if not match.group(3): + raise ValueError("open bracket") + + # Build a list of the features defined by the structure. + while position < len(s): + # Check for the close bracket. 
+            match = self._END_FSTRUCT_RE.match(s, position)
+            if match is not None:
+                return fstruct, match.end()
+
+            # Reentrances have the form "-> (target)"
+            match = self._REENTRANCE_RE.match(s, position)
+            if match:
+                position = match.end()
+                match = self._TARGET_RE.match(s, position)
+                if not match:
+                    raise ValueError("identifier", position)
+                target = match.group(1)
+                if target not in reentrances:
+                    raise ValueError("bound identifier", position)
+                position = match.end()
+                fstruct.append(reentrances[target])
+
+            # Anything else is a value.
+            else:
+                value, position = self._read_value(0, s, position, reentrances)
+                fstruct.append(value)
+
+            # If there's a close bracket, handle it at the top of the loop.
+            if self._END_FSTRUCT_RE.match(s, position):
+                continue
+
+            # Otherwise, there should be a comma
+            match = self._COMMA_RE.match(s, position)
+            if match is None:
+                raise ValueError("comma", position)
+            position = match.end()
+
+        # We never saw a close bracket.
+        raise ValueError("close bracket", position)
+
+    def _read_partial_featdict(self, s, position, match, reentrances, fstruct):
+        # If there was a prefix feature, record it.
+        if match.group(2):
+            if self._prefix_feature is None:
+                raise ValueError("open bracket or identifier", match.start(2))
+            prefixval = match.group(2).strip()
+            if prefixval.startswith("?"):
+                prefixval = Variable(prefixval)
+            fstruct[self._prefix_feature] = prefixval
+
+        # If group 3 is empty, then we just have a bare prefix, so
+        # we're done.
+        if not match.group(3):
+            return self._finalize(s, match.end(), reentrances, fstruct)
+
+        # Build a list of the features defined by the structure.
+        # Each feature has one of the following forms:
+        #     name = value
+        #     name -> (target)
+        #     +name
+        #     -name
+        while position < len(s):
+            # Use these variables to hold info about each feature:
+            name = value = None
+
+            # Check for the close bracket.
+            match = self._END_FSTRUCT_RE.match(s, position)
+            if match is not None:
+                return self._finalize(s, match.end(), reentrances, fstruct)
+
+            # Get the feature name
+            match = self._FEATURE_NAME_RE.match(s, position)
+            if match is None:
+                raise ValueError("feature name", position)
+            name = match.group(2)
+            position = match.end()
+
+            # Check if it's a special feature.
+            if name[0] == "*" and name[-1] == "*":
+                name = self._features.get(name[1:-1])
+                if name is None:
+                    raise ValueError("known special feature", match.start(2))
+
+            # Check if this feature has a value already.
+            if name in fstruct:
+                raise ValueError("new name", match.start(2))
+
+            # Boolean value ("+name" or "-name")
+            if match.group(1) == "+":
+                value = True
+            if match.group(1) == "-":
+                value = False
+
+            # Reentrance link ("-> (target)")
+            if value is None:
+                match = self._REENTRANCE_RE.match(s, position)
+                if match is not None:
+                    position = match.end()
+                    match = self._TARGET_RE.match(s, position)
+                    if not match:
+                        raise ValueError("identifier", position)
+                    target = match.group(1)
+                    if target not in reentrances:
+                        raise ValueError("bound identifier", position)
+                    position = match.end()
+                    value = reentrances[target]
+
+            # Assignment ("= value").
+            if value is None:
+                match = self._ASSIGN_RE.match(s, position)
+                if match:
+                    position = match.end()
+                    value, position = self._read_value(name, s, position, reentrances)
+                # None of the above: error.
+                else:
+                    raise ValueError("equals sign", position)
+
+            # Store the value.
+            fstruct[name] = value
+
+            # If there's a close bracket, handle it at the top of the loop.
+            if self._END_FSTRUCT_RE.match(s, position):
+                continue
+
+            # Otherwise, there should be a comma
+            match = self._COMMA_RE.match(s, position)
+            if match is None:
+                raise ValueError("comma", position)
+            position = match.end()
+
+        # We never saw a close bracket.
+        raise ValueError("close bracket", position)
+
+    def _finalize(self, s, pos, reentrances, fstruct):
+        """
+        Called when we see the close brace -- checks for a slash feature,
+        and adds in default values.
+        """
+        # Add the slash feature (if any)
+        match = self._SLASH_RE.match(s, pos)
+        if match:
+            name = self._slash_feature
+            v, pos = self._read_value(name, s, match.end(), reentrances)
+            fstruct[name] = v
+        ## Add any default features. -- handle in unification instead?
+        # for feature in self._features_with_defaults:
+        #     fstruct.setdefault(feature, feature.default)
+        # Return the value.
+        return fstruct, pos
+
+    def _read_value(self, name, s, position, reentrances):
+        if isinstance(name, Feature):
+            return name.read_value(s, position, reentrances, self)
+        else:
+            return self.read_value(s, position, reentrances)
+
+    def read_value(self, s, position, reentrances):
+        for (handler, regexp) in self.VALUE_HANDLERS:
+            match = regexp.match(s, position)
+            if match:
+                handler_func = getattr(self, handler)
+                return handler_func(s, position, reentrances, match)
+        raise ValueError("value", position)
+
+    def _error(self, s, expected, position):
+        lines = s.split("\n")
+        while position > len(lines[0]):
+            position -= len(lines.pop(0)) + 1  # +1 for the newline.
+        estr = (
+            "Error parsing feature structure\n    "
+            + lines[0]
+            + "\n    "
+            + " " * position
+            + "^ "
+            + "Expected %s" % expected
+        )
+        raise ValueError(estr)
+
+    # ////////////////////////////////////////////////////////////
+    # { Value Readers
+    # ////////////////////////////////////////////////////////////
+
+    #: A table indicating how feature values should be processed. Each
+    #: entry in the table is a pair (handler, regexp). The first entry
+    #: with a matching regexp will have its handler called. Handlers
+    #: should have the following signature::
+    #:
+    #:    def handler(s, position, reentrances, match): ...
+    #:
+    #: and should return a tuple (value, position), where position is
+    #: the string position where the value ended. (n.b.: order is
+    #: important here!)
+    VALUE_HANDLERS = [
+        ("read_fstruct_value", _START_FSTRUCT_RE),
+        ("read_var_value", re.compile(r"\?[a-zA-Z_][a-zA-Z0-9_]*")),
+        ("read_str_value", re.compile("[uU]?[rR]?(['\"])")),
+        ("read_int_value", re.compile(r"-?\d+")),
+        ("read_sym_value", re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*")),
+        (
+            "read_app_value",
+            re.compile(r"<(app)\((\?[a-z][a-z]*)\s*," r"\s*(\?[a-z][a-z]*)\)>"),
+        ),
+        # ('read_logic_value', re.compile(r'<([^>]*)>')),
+        # lazily match any character after '<' until we hit a '>' not preceded by '-'
+        ("read_logic_value", re.compile(r"<(.*?)(?<!-)>")),
+        ("read_set_value", re.compile(r"{")),
+        ("read_tuple_value", re.compile(r"\(")),
+    ]
+
+    def read_fstruct_value(self, s, position, reentrances, match):
+        return self.read_partial(s, position, reentrances)
+
+    def read_str_value(self, s, position, reentrances, match):
+        return read_str(s, position)
+
+    def read_int_value(self, s, position, reentrances, match):
+        return int(match.group()), match.end()
+
+    # Note: the '?' is included in the variable name.
+ def read_var_value(self, s, position, reentrances, match): + return Variable(match.group()), match.end() + + _SYM_CONSTS = {"None": None, "True": True, "False": False} + + def read_sym_value(self, s, position, reentrances, match): + val, end = match.group(), match.end() + return self._SYM_CONSTS.get(val, val), end + + def read_app_value(self, s, position, reentrances, match): + """Mainly included for backwards compat.""" + return self._logic_parser.parse("%s(%s)" % match.group(2, 3)), match.end() + + def read_logic_value(self, s, position, reentrances, match): + try: + try: + expr = self._logic_parser.parse(match.group(1)) + except LogicalExpressionException as e: + raise ValueError from e + return expr, match.end() + except ValueError as e: + raise ValueError("logic expression", match.start(1)) from e + + def read_tuple_value(self, s, position, reentrances, match): + return self._read_seq_value( + s, position, reentrances, match, ")", FeatureValueTuple, FeatureValueConcat + ) + + def read_set_value(self, s, position, reentrances, match): + return self._read_seq_value( + s, position, reentrances, match, "}", FeatureValueSet, FeatureValueUnion + ) + + def _read_seq_value( + self, s, position, reentrances, match, close_paren, seq_class, plus_class + ): + """ + Helper function used by read_tuple_value and read_set_value. + """ + cp = re.escape(close_paren) + position = match.end() + # Special syntax of empty tuples: + m = re.compile(r"\s*/?\s*%s" % cp).match(s, position) + if m: + return seq_class(), m.end() + # Read values: + values = [] + seen_plus = False + while True: + # Close paren: return value. + m = re.compile(r"\s*%s" % cp).match(s, position) + if m: + if seen_plus: + return plus_class(values), m.end() + else: + return seq_class(values), m.end() + + # Read the next value. + val, position = self.read_value(s, position, reentrances) + values.append(val) + + # Comma or looking at close paren + m = re.compile(r"\s*(,|\+|(?=%s))\s*" % cp).match(s, position) + if not m: + raise ValueError("',' or '+' or '%s'" % cp, position) + if m.group(1) == "+": + seen_plus = True + position = m.end() + + +###################################################################### +# { Demo +###################################################################### + + +def display_unification(fs1, fs2, indent=" "): + # Print the two input feature structures, side by side. 
+ fs1_lines = ("%s" % fs1).split("\n") + fs2_lines = ("%s" % fs2).split("\n") + if len(fs1_lines) > len(fs2_lines): + blankline = "[" + " " * (len(fs2_lines[0]) - 2) + "]" + fs2_lines += [blankline] * len(fs1_lines) + else: + blankline = "[" + " " * (len(fs1_lines[0]) - 2) + "]" + fs1_lines += [blankline] * len(fs2_lines) + for (fs1_line, fs2_line) in zip(fs1_lines, fs2_lines): + print(indent + fs1_line + " " + fs2_line) + print(indent + "-" * len(fs1_lines[0]) + " " + "-" * len(fs2_lines[0])) + + linelen = len(fs1_lines[0]) * 2 + 3 + print(indent + "| |".center(linelen)) + print(indent + "+-----UNIFY-----+".center(linelen)) + print(indent + "|".center(linelen)) + print(indent + "V".center(linelen)) + + bindings = {} + + result = fs1.unify(fs2, bindings) + if result is None: + print(indent + "(FAILED)".center(linelen)) + else: + print( + "\n".join(indent + l.center(linelen) for l in ("%s" % result).split("\n")) + ) + if bindings and len(bindings.bound_variables()) > 0: + print(repr(bindings).center(linelen)) + return result + + +def interactive_demo(trace=False): + import random + import sys + + HELP = """ + 1-%d: Select the corresponding feature structure + q: Quit + t: Turn tracing on or off + l: List all feature structures + ?: Help + """ + + print( + """ + This demo will repeatedly present you with a list of feature + structures, and ask you to choose two for unification. Whenever a + new feature structure is generated, it is added to the list of + choices that you can pick from. However, since this can be a + large number of feature structures, the demo will only print out a + random subset for you to choose between at a given time. If you + want to see the complete lists, type "l". For a list of valid + commands, type "?". + """ + ) + print('Press "Enter" to continue...') + sys.stdin.readline() + + fstruct_strings = [ + "[agr=[number=sing, gender=masc]]", + "[agr=[gender=masc, person=3]]", + "[agr=[gender=fem, person=3]]", + "[subj=[agr=(1)[]], agr->(1)]", + "[obj=?x]", + "[subj=?x]", + "[/=None]", + "[/=NP]", + "[cat=NP]", + "[cat=VP]", + "[cat=PP]", + "[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]", + "[gender=masc, agr=?C]", + "[gender=?S, agr=[gender=?S,person=3]]", + ] + + all_fstructs = [ + (i, FeatStruct(fstruct_strings[i])) for i in range(len(fstruct_strings)) + ] + + def list_fstructs(fstructs): + for i, fstruct in fstructs: + print() + lines = ("%s" % fstruct).split("\n") + print("%3d: %s" % (i + 1, lines[0])) + for line in lines[1:]: + print(" " + line) + print() + + while True: + # Pick 5 feature structures at random from the master list. 
+ MAX_CHOICES = 5 + if len(all_fstructs) > MAX_CHOICES: + fstructs = sorted(random.sample(all_fstructs, MAX_CHOICES)) + else: + fstructs = all_fstructs + + print("_" * 75) + + print("Choose two feature structures to unify:") + list_fstructs(fstructs) + + selected = [None, None] + for (nth, i) in (("First", 0), ("Second", 1)): + while selected[i] is None: + print( + ( + "%s feature structure (1-%d,q,t,l,?): " + % (nth, len(all_fstructs)) + ), + end=" ", + ) + try: + input = sys.stdin.readline().strip() + if input in ("q", "Q", "x", "X"): + return + if input in ("t", "T"): + trace = not trace + print(" Trace = %s" % trace) + continue + if input in ("h", "H", "?"): + print(HELP % len(fstructs)) + continue + if input in ("l", "L"): + list_fstructs(all_fstructs) + continue + num = int(input) - 1 + selected[i] = all_fstructs[num][1] + print() + except: + print("Bad sentence number") + continue + + if trace: + result = selected[0].unify(selected[1], trace=1) + else: + result = display_unification(selected[0], selected[1]) + if result is not None: + for i, fstruct in all_fstructs: + if repr(result) == repr(fstruct): + break + else: + all_fstructs.append((len(all_fstructs), result)) + + print('\nType "Enter" to continue unifying; or "q" to quit.') + input = sys.stdin.readline().strip() + if input in ("q", "Q", "x", "X"): + return + + +def demo(trace=False): + """ + Just for testing + """ + # import random + + # processor breaks with values like '3rd' + fstruct_strings = [ + "[agr=[number=sing, gender=masc]]", + "[agr=[gender=masc, person=3]]", + "[agr=[gender=fem, person=3]]", + "[subj=[agr=(1)[]], agr->(1)]", + "[obj=?x]", + "[subj=?x]", + "[/=None]", + "[/=NP]", + "[cat=NP]", + "[cat=VP]", + "[cat=PP]", + "[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]", + "[gender=masc, agr=?C]", + "[gender=?S, agr=[gender=?S,person=3]]", + ] + all_fstructs = [FeatStruct(fss) for fss in fstruct_strings] + # MAX_CHOICES = 5 + # if len(all_fstructs) > MAX_CHOICES: + # fstructs = random.sample(all_fstructs, MAX_CHOICES) + # fstructs.sort() + # else: + # fstructs = all_fstructs + + for fs1 in all_fstructs: + for fs2 in all_fstructs: + print( + "\n*******************\nfs1 is:\n%s\n\nfs2 is:\n%s\n\nresult is:\n%s" + % (fs1, fs2, unify(fs1, fs2)) + ) + + +if __name__ == "__main__": + demo() + +__all__ = [ + "FeatStruct", + "FeatDict", + "FeatList", + "unify", + "subsumes", + "conflicts", + "Feature", + "SlashFeature", + "RangeFeature", + "SLASH", + "TYPE", + "FeatStructReader", +] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/help.py b/.eggs/nltk-3.8-py3.10.egg/nltk/help.py new file mode 100644 index 0000000000000000000000000000000000000000..21529f8058b8292e0c1ddd656f18883e29b5ce39 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/help.py @@ -0,0 +1,64 @@ +# Natural Language Toolkit (NLTK) Help +# +# Copyright (C) 2001-2022 NLTK Project +# Authors: Steven Bird +# URL: +# For license information, see LICENSE.TXT + +""" +Provide structured access to documentation. 
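+
+For example, the following is a sketch of typical usage (it assumes the
+``tagsets`` help data has been installed, e.g. via
+``nltk.download('tagsets')``, and the printed entry is abbreviated):
+
+>>> from nltk.help import upenn_tagset
+>>> upenn_tagset('RB')  # doctest: +SKIP
+RB: adverb
+    ...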
+""" + +import re +from textwrap import wrap + +from nltk.data import load + + +def brown_tagset(tagpattern=None): + _format_tagset("brown_tagset", tagpattern) + + +def claws5_tagset(tagpattern=None): + _format_tagset("claws5_tagset", tagpattern) + + +def upenn_tagset(tagpattern=None): + _format_tagset("upenn_tagset", tagpattern) + + +##################################################################### +# UTILITIES +##################################################################### + + +def _print_entries(tags, tagdict): + for tag in tags: + entry = tagdict[tag] + defn = [tag + ": " + entry[0]] + examples = wrap( + entry[1], width=75, initial_indent=" ", subsequent_indent=" " + ) + print("\n".join(defn + examples)) + + +def _format_tagset(tagset, tagpattern=None): + tagdict = load("help/tagsets/" + tagset + ".pickle") + if not tagpattern: + _print_entries(sorted(tagdict), tagdict) + elif tagpattern in tagdict: + _print_entries([tagpattern], tagdict) + else: + tagpattern = re.compile(tagpattern) + tags = [tag for tag in sorted(tagdict) if tagpattern.match(tag)] + if tags: + _print_entries(tags, tagdict) + else: + print("No matching tags found.") + + +if __name__ == "__main__": + brown_tagset(r"NN.*") + upenn_tagset(r".*\$") + claws5_tagset("UNDEFINED") + brown_tagset(r"NN") diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/inference/discourse.py b/.eggs/nltk-3.8-py3.10.egg/nltk/inference/discourse.py new file mode 100644 index 0000000000000000000000000000000000000000..9630234dcf3837d9da2b4213fe26d22491899932 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/inference/discourse.py @@ -0,0 +1,651 @@ +# Natural Language Toolkit: Discourse Processing +# +# Author: Ewan Klein +# Dan Garrette +# +# URL: +# For license information, see LICENSE.TXT + +r""" +Module for incrementally developing simple discourses, and checking for semantic ambiguity, +consistency and informativeness. + +Many of the ideas are based on the CURT family of programs of Blackburn and Bos +(see http://homepages.inf.ed.ac.uk/jbos/comsem/book1.html). + +Consistency checking is carried out by using the ``mace`` module to call the Mace4 model builder. +Informativeness checking is carried out with a call to ``Prover.prove()`` from +the ``inference`` module. + +``DiscourseTester`` is a constructor for discourses. +The basic data structure is a list of sentences, stored as ``self._sentences``. Each sentence in the list +is assigned a "sentence ID" (``sid``) of the form ``s``\ *i*. For example:: + + s0: A boxer walks + s1: Every boxer chases a girl + +Each sentence can be ambiguous between a number of readings, each of which receives a +"reading ID" (``rid``) of the form ``s``\ *i* -``r``\ *j*. For example:: + + s0 readings: + + s0-r1: some x.(boxer(x) & walk(x)) + s0-r0: some x.(boxerdog(x) & walk(x)) + +A "thread" is a list of readings, represented as a list of ``rid``\ s. +Each thread receives a "thread ID" (``tid``) of the form ``d``\ *i*. +For example:: + + d0: ['s0-r0', 's1-r0'] + +The set of all threads for a discourse is the Cartesian product of all the readings of the sequences of sentences. +(This is not intended to scale beyond very short discourses!) The method ``readings(filter=True)`` will only show +those threads which are consistent (taking into account any background assumptions). 
+""" + +import os +from abc import ABCMeta, abstractmethod +from functools import reduce +from operator import add, and_ + +from nltk.data import show_cfg +from nltk.inference.mace import MaceCommand +from nltk.inference.prover9 import Prover9Command +from nltk.parse import load_parser +from nltk.parse.malt import MaltParser +from nltk.sem.drt import AnaphoraResolutionException, resolve_anaphora +from nltk.sem.glue import DrtGlue +from nltk.sem.logic import Expression +from nltk.tag import RegexpTagger + + +class ReadingCommand(metaclass=ABCMeta): + @abstractmethod + def parse_to_readings(self, sentence): + """ + :param sentence: the sentence to read + :type sentence: str + """ + + def process_thread(self, sentence_readings): + """ + This method should be used to handle dependencies between readings such + as resolving anaphora. + + :param sentence_readings: readings to process + :type sentence_readings: list(Expression) + :return: the list of readings after processing + :rtype: list(Expression) + """ + return sentence_readings + + @abstractmethod + def combine_readings(self, readings): + """ + :param readings: readings to combine + :type readings: list(Expression) + :return: one combined reading + :rtype: Expression + """ + + @abstractmethod + def to_fol(self, expression): + """ + Convert this expression into a First-Order Logic expression. + + :param expression: an expression + :type expression: Expression + :return: a FOL version of the input expression + :rtype: Expression + """ + + +class CfgReadingCommand(ReadingCommand): + def __init__(self, gramfile=None): + """ + :param gramfile: name of file where grammar can be loaded + :type gramfile: str + """ + self._gramfile = ( + gramfile if gramfile else "grammars/book_grammars/discourse.fcfg" + ) + self._parser = load_parser(self._gramfile) + + def parse_to_readings(self, sentence): + """:see: ReadingCommand.parse_to_readings()""" + from nltk.sem import root_semrep + + tokens = sentence.split() + trees = self._parser.parse(tokens) + return [root_semrep(tree) for tree in trees] + + def combine_readings(self, readings): + """:see: ReadingCommand.combine_readings()""" + return reduce(and_, readings) + + def to_fol(self, expression): + """:see: ReadingCommand.to_fol()""" + return expression + + +class DrtGlueReadingCommand(ReadingCommand): + def __init__(self, semtype_file=None, remove_duplicates=False, depparser=None): + """ + :param semtype_file: name of file where grammar can be loaded + :param remove_duplicates: should duplicates be removed? + :param depparser: the dependency parser + """ + if semtype_file is None: + semtype_file = os.path.join( + "grammars", "sample_grammars", "drt_glue.semtype" + ) + self._glue = DrtGlue( + semtype_file=semtype_file, + remove_duplicates=remove_duplicates, + depparser=depparser, + ) + + def parse_to_readings(self, sentence): + """:see: ReadingCommand.parse_to_readings()""" + return self._glue.parse_to_meaning(sentence) + + def process_thread(self, sentence_readings): + """:see: ReadingCommand.process_thread()""" + try: + return [self.combine_readings(sentence_readings)] + except AnaphoraResolutionException: + return [] + + def combine_readings(self, readings): + """:see: ReadingCommand.combine_readings()""" + thread_reading = reduce(add, readings) + return resolve_anaphora(thread_reading.simplify()) + + def to_fol(self, expression): + """:see: ReadingCommand.to_fol()""" + return expression.fol() + + +class DiscourseTester: + """ + Check properties of an ongoing discourse. 
+ """ + + def __init__(self, input, reading_command=None, background=None): + """ + Initialize a ``DiscourseTester``. + + :param input: the discourse sentences + :type input: list of str + :param background: Formulas which express background assumptions + :type background: list(Expression) + """ + self._input = input + self._sentences = {"s%s" % i: sent for i, sent in enumerate(input)} + self._models = None + self._readings = {} + self._reading_command = ( + reading_command if reading_command else CfgReadingCommand() + ) + self._threads = {} + self._filtered_threads = {} + if background is not None: + from nltk.sem.logic import Expression + + for e in background: + assert isinstance(e, Expression) + self._background = background + else: + self._background = [] + + ############################### + # Sentences + ############################### + + def sentences(self): + """ + Display the list of sentences in the current discourse. + """ + for id in sorted(self._sentences): + print(f"{id}: {self._sentences[id]}") + + def add_sentence(self, sentence, informchk=False, consistchk=False): + """ + Add a sentence to the current discourse. + + Updates ``self._input`` and ``self._sentences``. + :param sentence: An input sentence + :type sentence: str + :param informchk: if ``True``, check that the result of adding the sentence is thread-informative. Updates ``self._readings``. + :param consistchk: if ``True``, check that the result of adding the sentence is thread-consistent. Updates ``self._readings``. + + """ + # check whether the new sentence is informative (i.e. not entailed by the previous discourse) + if informchk: + self.readings(verbose=False) + for tid in sorted(self._threads): + assumptions = [reading for (rid, reading) in self.expand_threads(tid)] + assumptions += self._background + for sent_reading in self._get_readings(sentence): + tp = Prover9Command(goal=sent_reading, assumptions=assumptions) + if tp.prove(): + print( + "Sentence '%s' under reading '%s':" + % (sentence, str(sent_reading)) + ) + print("Not informative relative to thread '%s'" % tid) + + self._input.append(sentence) + self._sentences = {"s%s" % i: sent for i, sent in enumerate(self._input)} + # check whether adding the new sentence to the discourse preserves consistency (i.e. a model can be found for the combined set of + # of assumptions + if consistchk: + self.readings(verbose=False) + self.models(show=False) + + def retract_sentence(self, sentence, verbose=True): + """ + Remove a sentence from the current discourse. + + Updates ``self._input``, ``self._sentences`` and ``self._readings``. + :param sentence: An input sentence + :type sentence: str + :param verbose: If ``True``, report on the updated list of sentences. + """ + try: + self._input.remove(sentence) + except ValueError: + print( + "Retraction failed. The sentence '%s' is not part of the current discourse:" + % sentence + ) + self.sentences() + return None + self._sentences = {"s%s" % i: sent for i, sent in enumerate(self._input)} + self.readings(verbose=False) + if verbose: + print("Current sentences are ") + self.sentences() + + def grammar(self): + """ + Print out the grammar in use for parsing input sentences + """ + show_cfg(self._reading_command._gramfile) + + ############################### + # Readings and Threads + ############################### + + def _get_readings(self, sentence): + """ + Build a list of semantic readings for a sentence. 
+
+        :rtype: list(Expression)
+        """
+        return self._reading_command.parse_to_readings(sentence)
+
+    def _construct_readings(self):
+        """
+        Use ``self._sentences`` to construct a value for ``self._readings``.
+        """
+        # re-initialize self._readings in case we have retracted a sentence
+        self._readings = {}
+        for sid in sorted(self._sentences):
+            sentence = self._sentences[sid]
+            readings = self._get_readings(sentence)
+            self._readings[sid] = {
+                f"{sid}-r{rid}": reading.simplify()
+                for rid, reading in enumerate(sorted(readings, key=str))
+            }
+
+    def _construct_threads(self):
+        """
+        Use ``self._readings`` to construct a value for ``self._threads``
+        and use the model builder to construct a value for ``self._filtered_threads``
+        """
+        thread_list = [[]]
+        for sid in sorted(self._readings):
+            thread_list = self.multiply(thread_list, sorted(self._readings[sid]))
+        self._threads = {"d%s" % tid: thread for tid, thread in enumerate(thread_list)}
+        # re-initialize the filtered threads
+        self._filtered_threads = {}
+        # keep the same ids, but only include threads which get models
+        consistency_checked = self._check_consistency(self._threads)
+        for (tid, thread) in self._threads.items():
+            if (tid, True) in consistency_checked:
+                self._filtered_threads[tid] = thread
+
+    def _show_readings(self, sentence=None):
+        """
+        Print out the readings for the discourse (or a single sentence).
+        """
+        if sentence is not None:
+            print("The sentence '%s' has these readings:" % sentence)
+            for r in [str(reading) for reading in (self._get_readings(sentence))]:
+                print("    %s" % r)
+        else:
+            for sid in sorted(self._readings):
+                print()
+                print("%s readings:" % sid)
+                print()  #'-' * 30
+                for rid in sorted(self._readings[sid]):
+                    lf = self._readings[sid][rid]
+                    print(f"{rid}: {lf.normalize()}")
+
+    def _show_threads(self, filter=False, show_thread_readings=False):
+        """
+        Print out the value of ``self._threads`` or ``self._filtered_threads``
+        """
+        threads = self._filtered_threads if filter else self._threads
+        for tid in sorted(threads):
+            if show_thread_readings:
+                readings = [
+                    self._readings[rid.split("-")[0]][rid] for rid in self._threads[tid]
+                ]
+                try:
+                    thread_reading = (
+                        ": %s"
+                        % self._reading_command.combine_readings(readings).normalize()
+                    )
+                except Exception as e:
+                    thread_reading = ": INVALID: %s" % e.__class__.__name__
+            else:
+                thread_reading = ""
+
+            print("%s:" % tid, self._threads[tid], thread_reading)
+
+    def readings(
+        self,
+        sentence=None,
+        threaded=False,
+        verbose=True,
+        filter=False,
+        show_thread_readings=False,
+    ):
+        """
+        Construct and show the readings of the discourse (or of a single sentence).
+
+        :param sentence: test just this sentence
+        :type sentence: str
+        :param threaded: if ``True``, print out each thread ID and the corresponding thread.
+        :param filter: if ``True``, only print out consistent thread IDs and threads.
+        """
+        self._construct_readings()
+        self._construct_threads()
+
+        # if we are filtering or showing thread readings, show threads
+        if filter or show_thread_readings:
+            threaded = True
+
+        if verbose:
+            if not threaded:
+                self._show_readings(sentence=sentence)
+            else:
+                self._show_threads(
+                    filter=filter, show_thread_readings=show_thread_readings
+                )
+
+    def expand_threads(self, thread_id, threads=None):
+        """
+        Given a thread ID, find the list of ``logic.Expression`` objects corresponding to the reading IDs in that thread.
+ + :param thread_id: thread ID + :type thread_id: str + :param threads: a mapping from thread IDs to lists of reading IDs + :type threads: dict + :return: A list of pairs ``(rid, reading)`` where reading is the ``logic.Expression`` associated with a reading ID + :rtype: list of tuple + """ + if threads is None: + threads = self._threads + return [ + (rid, self._readings[sid][rid]) + for rid in threads[thread_id] + for sid in rid.split("-")[:1] + ] + + ############################### + # Models and Background + ############################### + + def _check_consistency(self, threads, show=False, verbose=False): + results = [] + for tid in sorted(threads): + assumptions = [ + reading for (rid, reading) in self.expand_threads(tid, threads=threads) + ] + assumptions = list( + map( + self._reading_command.to_fol, + self._reading_command.process_thread(assumptions), + ) + ) + if assumptions: + assumptions += self._background + # if Mace4 finds a model, it always seems to find it quickly + mb = MaceCommand(None, assumptions, max_models=20) + modelfound = mb.build_model() + else: + modelfound = False + results.append((tid, modelfound)) + if show: + spacer(80) + print("Model for Discourse Thread %s" % tid) + spacer(80) + if verbose: + for a in assumptions: + print(a) + spacer(80) + if modelfound: + print(mb.model(format="cooked")) + else: + print("No model found!\n") + return results + + def models(self, thread_id=None, show=True, verbose=False): + """ + Call Mace4 to build a model for each current discourse thread. + + :param thread_id: thread ID + :type thread_id: str + :param show: If ``True``, display the model that has been found. + """ + self._construct_readings() + self._construct_threads() + threads = {thread_id: self._threads[thread_id]} if thread_id else self._threads + + for (tid, modelfound) in self._check_consistency( + threads, show=show, verbose=verbose + ): + idlist = [rid for rid in threads[tid]] + + if not modelfound: + print(f"Inconsistent discourse: {tid} {idlist}:") + for rid, reading in self.expand_threads(tid): + print(f" {rid}: {reading.normalize()}") + print() + else: + print(f"Consistent discourse: {tid} {idlist}:") + for rid, reading in self.expand_threads(tid): + print(f" {rid}: {reading.normalize()}") + print() + + def add_background(self, background, verbose=False): + """ + Add a list of background assumptions for reasoning about the discourse. + + When called, this method also updates the discourse model's set of readings and threads. + :param background: Formulas which contain background information + :type background: list(Expression) + """ + from nltk.sem.logic import Expression + + for (count, e) in enumerate(background): + assert isinstance(e, Expression) + if verbose: + print("Adding assumption %s to background" % count) + self._background.append(e) + + # update the state + self._construct_readings() + self._construct_threads() + + def background(self): + """ + Show the current background assumptions. + """ + for e in self._background: + print(str(e)) + + ############################### + # Misc + ############################### + + @staticmethod + def multiply(discourse, readings): + """ + Multiply every thread in ``discourse`` by every reading in ``readings``. 
+ + Given discourse = [['A'], ['B']], readings = ['a', 'b', 'c'] , returns + [['A', 'a'], ['A', 'b'], ['A', 'c'], ['B', 'a'], ['B', 'b'], ['B', 'c']] + + :param discourse: the current list of readings + :type discourse: list of lists + :param readings: an additional list of readings + :type readings: list(Expression) + :rtype: A list of lists + """ + result = [] + for sublist in discourse: + for r in readings: + new = [] + new += sublist + new.append(r) + result.append(new) + return result + + +def load_fol(s): + """ + Temporarily duplicated from ``nltk.sem.util``. + Convert a file of first order formulas into a list of ``Expression`` objects. + + :param s: the contents of the file + :type s: str + :return: a list of parsed formulas. + :rtype: list(Expression) + """ + statements = [] + for linenum, line in enumerate(s.splitlines()): + line = line.strip() + if line.startswith("#") or line == "": + continue + try: + statements.append(Expression.fromstring(line)) + except Exception as e: + raise ValueError(f"Unable to parse line {linenum}: {line}") from e + return statements + + +############################### +# Demo +############################### +def discourse_demo(reading_command=None): + """ + Illustrate the various methods of ``DiscourseTester`` + """ + dt = DiscourseTester( + ["A boxer walks", "Every boxer chases a girl"], reading_command + ) + dt.models() + print() + # dt.grammar() + print() + dt.sentences() + print() + dt.readings() + print() + dt.readings(threaded=True) + print() + dt.models("d1") + dt.add_sentence("John is a boxer") + print() + dt.sentences() + print() + dt.readings(threaded=True) + print() + dt = DiscourseTester( + ["A student dances", "Every student is a person"], reading_command + ) + print() + dt.add_sentence("No person dances", consistchk=True) + print() + dt.readings() + print() + dt.retract_sentence("No person dances", verbose=True) + print() + dt.models() + print() + dt.readings("A person dances") + print() + dt.add_sentence("A person dances", informchk=True) + dt = DiscourseTester( + ["Vincent is a boxer", "Fido is a boxer", "Vincent is married", "Fido barks"], + reading_command, + ) + dt.readings(filter=True) + import nltk.data + + background_file = os.path.join("grammars", "book_grammars", "background.fol") + background = nltk.data.load(background_file) + + print() + dt.add_background(background, verbose=False) + dt.background() + print() + dt.readings(filter=True) + print() + dt.models() + + +def drt_discourse_demo(reading_command=None): + """ + Illustrate the various methods of ``DiscourseTester`` + """ + dt = DiscourseTester(["every dog chases a boy", "he runs"], reading_command) + dt.models() + print() + dt.sentences() + print() + dt.readings() + print() + dt.readings(show_thread_readings=True) + print() + dt.readings(filter=True, show_thread_readings=True) + + +def spacer(num=30): + print("-" * num) + + +def demo(): + discourse_demo() + + tagger = RegexpTagger( + [ + ("^(chases|runs)$", "VB"), + ("^(a)$", "ex_quant"), + ("^(every)$", "univ_quant"), + ("^(dog|boy)$", "NN"), + ("^(he)$", "PRP"), + ] + ) + depparser = MaltParser(tagger=tagger) + drt_discourse_demo( + DrtGlueReadingCommand(remove_duplicates=False, depparser=depparser) + ) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/inference/mace.py b/.eggs/nltk-3.8-py3.10.egg/nltk/inference/mace.py new file mode 100644 index 0000000000000000000000000000000000000000..ee4d9e8e38d7db34c4b58f9c37dee330d397e123 --- /dev/null +++ 
b/.eggs/nltk-3.8-py3.10.egg/nltk/inference/mace.py @@ -0,0 +1,383 @@ +# Natural Language Toolkit: Interface to the Mace4 Model Builder +# +# Author: Dan Garrette +# Ewan Klein + +# URL: +# For license information, see LICENSE.TXT + +""" +A model builder that makes use of the external 'Mace4' package. +""" + +import os +import tempfile + +from nltk.inference.api import BaseModelBuilderCommand, ModelBuilder +from nltk.inference.prover9 import Prover9CommandParent, Prover9Parent +from nltk.sem import Expression, Valuation +from nltk.sem.logic import is_indvar + + +class MaceCommand(Prover9CommandParent, BaseModelBuilderCommand): + """ + A ``MaceCommand`` specific to the ``Mace`` model builder. It contains + a print_assumptions() method that is used to print the list + of assumptions in multiple formats. + """ + + _interpformat_bin = None + + def __init__(self, goal=None, assumptions=None, max_models=500, model_builder=None): + """ + :param goal: Input expression to prove + :type goal: sem.Expression + :param assumptions: Input expressions to use as assumptions in + the proof. + :type assumptions: list(sem.Expression) + :param max_models: The maximum number of models that Mace will try before + simply returning false. (Use 0 for no maximum.) + :type max_models: int + """ + if model_builder is not None: + assert isinstance(model_builder, Mace) + else: + model_builder = Mace(max_models) + + BaseModelBuilderCommand.__init__(self, model_builder, goal, assumptions) + + @property + def valuation(mbc): + return mbc.model("valuation") + + def _convert2val(self, valuation_str): + """ + Transform the output file into an NLTK-style Valuation. + + :return: A model if one is generated; None otherwise. + :rtype: sem.Valuation + """ + valuation_standard_format = self._transform_output(valuation_str, "standard") + + val = [] + for line in valuation_standard_format.splitlines(False): + l = line.strip() + + if l.startswith("interpretation"): + # find the number of entities in the model + num_entities = int(l[l.index("(") + 1 : l.index(",")].strip()) + + elif l.startswith("function") and l.find("_") == -1: + # replace the integer identifier with a corresponding alphabetic character + name = l[l.index("(") + 1 : l.index(",")].strip() + if is_indvar(name): + name = name.upper() + value = int(l[l.index("[") + 1 : l.index("]")].strip()) + val.append((name, MaceCommand._make_model_var(value))) + + elif l.startswith("relation"): + l = l[l.index("(") + 1 :] + if "(" in l: + # relation is not nullary + name = l[: l.index("(")].strip() + values = [ + int(v.strip()) + for v in l[l.index("[") + 1 : l.index("]")].split(",") + ] + val.append( + (name, MaceCommand._make_relation_set(num_entities, values)) + ) + else: + # relation is nullary + name = l[: l.index(",")].strip() + value = int(l[l.index("[") + 1 : l.index("]")].strip()) + val.append((name, value == 1)) + + return Valuation(val) + + @staticmethod + def _make_relation_set(num_entities, values): + """ + Convert a Mace4-style relation table into a dictionary. + + :param num_entities: the number of entities in the model; determines the row length in the table. + :type num_entities: int + :param values: a list of 1's and 0's that represent whether a relation holds in a Mace4 model. 
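+            For example (values are read row-major), ``num_entities=2`` with
+            ``values=[0, 1, 1, 0]`` encodes the relation ``{('a', 'b'), ('b', 'a')}``.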
+ :type values: list of int + """ + r = set() + for position in [pos for (pos, v) in enumerate(values) if v == 1]: + r.add( + tuple(MaceCommand._make_relation_tuple(position, values, num_entities)) + ) + return r + + @staticmethod + def _make_relation_tuple(position, values, num_entities): + if len(values) == 1: + return [] + else: + sublist_size = len(values) // num_entities + sublist_start = position // sublist_size + sublist_position = int(position % sublist_size) + + sublist = values[ + sublist_start * sublist_size : (sublist_start + 1) * sublist_size + ] + return [ + MaceCommand._make_model_var(sublist_start) + ] + MaceCommand._make_relation_tuple( + sublist_position, sublist, num_entities + ) + + @staticmethod + def _make_model_var(value): + """ + Pick an alphabetic character as identifier for an entity in the model. + + :param value: where to index into the list of characters + :type value: int + """ + letter = [ + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + ][value] + num = value // 26 + return letter + str(num) if num > 0 else letter + + def _decorate_model(self, valuation_str, format): + """ + Print out a Mace4 model using any Mace4 ``interpformat`` format. + See https://www.cs.unm.edu/~mccune/mace4/manual/ for details. + + :param valuation_str: str with the model builder's output + :param format: str indicating the format for displaying + models. Defaults to 'standard' format. + :return: str + """ + if not format: + return valuation_str + elif format == "valuation": + return self._convert2val(valuation_str) + else: + return self._transform_output(valuation_str, format) + + def _transform_output(self, valuation_str, format): + """ + Transform the output file into any Mace4 ``interpformat`` format. + + :param format: Output format for displaying models. + :type format: str + """ + if format in [ + "standard", + "standard2", + "portable", + "tabular", + "raw", + "cooked", + "xml", + "tex", + ]: + return self._call_interpformat(valuation_str, [format])[0] + else: + raise LookupError("The specified format does not exist") + + def _call_interpformat(self, input_str, args=[], verbose=False): + """ + Call the ``interpformat`` binary with the given input. + + :param input_str: A string whose contents are used as stdin. + :param args: A list of command-line arguments. + :return: A tuple (stdout, returncode) + :see: ``config_prover9`` + """ + if self._interpformat_bin is None: + self._interpformat_bin = self._modelbuilder._find_binary( + "interpformat", verbose + ) + + return self._modelbuilder._call( + input_str, self._interpformat_bin, args, verbose + ) + + +class Mace(Prover9Parent, ModelBuilder): + _mace4_bin = None + + def __init__(self, end_size=500): + self._end_size = end_size + """The maximum model size that Mace will try before + simply returning false. (Use -1 for no maximum.)""" + + def _build_model(self, goal=None, assumptions=None, verbose=False): + """ + Use Mace4 to build a first order model. + + :return: ``True`` if a model was found (i.e. Mace returns value of 0), + else ``False`` + """ + if not assumptions: + assumptions = [] + + stdout, returncode = self._call_mace4( + self.prover9_input(goal, assumptions), verbose=verbose + ) + return (returncode == 0, stdout) + + def _call_mace4(self, input_str, args=[], verbose=False): + """ + Call the ``mace4`` binary with the given input. 
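+        If ``self._end_size`` is positive, an ``assign(end_size, N).`` command
+        is prepended to the input, bounding the size of the models that Mace4
+        will search for.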
+
+        :param input_str: A string whose contents are used as stdin.
+        :param args: A list of command-line arguments.
+        :return: A tuple (stdout, returncode)
+        :see: ``config_prover9``
+        """
+        if self._mace4_bin is None:
+            self._mace4_bin = self._find_binary("mace4", verbose)
+
+        updated_input_str = ""
+        if self._end_size > 0:
+            updated_input_str += "assign(end_size, %d).\n\n" % self._end_size
+        updated_input_str += input_str
+
+        return self._call(updated_input_str, self._mace4_bin, args, verbose)
+
+
+def spacer(num=30):
+    print("-" * num)
+
+
+def decode_result(found):
+    """
+    Decode the result of model_found()
+
+    :param found: The output of model_found()
+    :type found: bool
+    """
+    return {True: "Countermodel found", False: "No countermodel found", None: "None"}[
+        found
+    ]
+
+
+def test_model_found(arguments):
+    """
+    Try some proofs and exhibit the results.
+    """
+    for (goal, assumptions) in arguments:
+        g = Expression.fromstring(goal)
+        alist = [Expression.fromstring(a) for a in assumptions]
+        m = MaceCommand(g, assumptions=alist, max_models=50)
+        found = m.build_model()
+        for a in alist:
+            print("   %s" % a)
+        print(f"|- {g}: {decode_result(found)}\n")
+
+
+def test_build_model(arguments):
+    """
+    Try to build a ``nltk.sem.Valuation``.
+    """
+    g = Expression.fromstring("all x.man(x)")
+    alist = [
+        Expression.fromstring(a)
+        for a in [
+            "man(John)",
+            "man(Socrates)",
+            "man(Bill)",
+            "some x.(-(x = John) & man(x) & sees(John,x))",
+            "some x.(-(x = Bill) & man(x))",
+            "all x.some y.(man(x) -> gives(Socrates,x,y))",
+        ]
+    ]
+
+    m = MaceCommand(g, assumptions=alist)
+    m.build_model()
+    spacer()
+    print("Assumptions and Goal")
+    spacer()
+    for a in alist:
+        print("   %s" % a)
+    print(f"|- {g}: {decode_result(m.build_model())}\n")
+    spacer()
+    # print(m.model('standard'))
+    # print(m.model('cooked'))
+    print("Valuation")
+    spacer()
+    print(m.valuation, "\n")
+
+
+def test_transform_output(argument_pair):
+    """
+    Transform the model into various Mace4 ``interpformat`` formats.
+    """
+    g = Expression.fromstring(argument_pair[0])
+    alist = [Expression.fromstring(a) for a in argument_pair[1]]
+    m = MaceCommand(g, assumptions=alist)
+    m.build_model()
+    for a in alist:
+        print("   %s" % a)
+    print(f"|- {g}: {m.build_model()}\n")
+    for format in ["standard", "portable", "xml", "cooked"]:
+        spacer()
+        print("Using '%s' format" % format)
+        spacer()
+        print(m.model(format=format))
+
+
+def test_make_relation_set():
+    print(
+        MaceCommand._make_relation_set(num_entities=3, values=[1, 0, 1])
+        == {("c",), ("a",)}
+    )
+    print(
+        MaceCommand._make_relation_set(
+            num_entities=3, values=[0, 0, 0, 0, 0, 0, 1, 0, 0]
+        )
+        == {("c", "a")}
+    )
+    print(
+        MaceCommand._make_relation_set(num_entities=2, values=[0, 0, 1, 0, 0, 0, 1, 0])
+        == {("a", "b", "a"), ("b", "b", "a")}
+    )
+
+
+arguments = [
+    ("mortal(Socrates)", ["all x.(man(x) -> mortal(x))", "man(Socrates)"]),
+    ("(not mortal(Socrates))", ["all x.(man(x) -> mortal(x))", "man(Socrates)"]),
+]
+
+
+def demo():
+    test_model_found(arguments)
+    test_build_model(arguments)
+    test_transform_output(arguments[1])
+
+
+if __name__ == "__main__":
+    demo()
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/inference/prover9.py b/.eggs/nltk-3.8-py3.10.egg/nltk/inference/prover9.py
new file mode 100644
index 0000000000000000000000000000000000000000..c05baecba133ba3a992ccc304f44f244793c990c
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/inference/prover9.py
@@ -0,0 +1,508 @@
+# Natural Language Toolkit: Interface to the Prover9 Theorem Prover
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Dan Garrette
+#         Ewan Klein
+#
+# URL:
+# For license information, see LICENSE.TXT
+"""
+A theorem prover that makes use of the external 'Prover9' package.
+"""
+
+import os
+import subprocess
+
+import nltk
+from nltk.inference.api import BaseProverCommand, Prover
+from nltk.sem.logic import (
+    AllExpression,
+    AndExpression,
+    EqualityExpression,
+    ExistsExpression,
+    Expression,
+    IffExpression,
+    ImpExpression,
+    NegatedExpression,
+    OrExpression,
+)
+
+#
+# Following is not yet used. Return code for 2 actually realized as 512.
+#
+p9_return_codes = {
+    0: True,
+    1: "(FATAL)",  # A fatal error occurred (user's syntax error).
+    2: False,  # (SOS_EMPTY) Prover9 ran out of things to do
+    #   (sos list exhausted).
+    3: "(MAX_MEGS)",  # The max_megs (memory limit) parameter was exceeded.
+    4: "(MAX_SECONDS)",  # The max_seconds parameter was exceeded.
+    5: "(MAX_GIVEN)",  # The max_given parameter was exceeded.
+    6: "(MAX_KEPT)",  # The max_kept parameter was exceeded.
+    7: "(ACTION)",  # A Prover9 action terminated the search.
+    101: "(SIGSEGV)",  # Prover9 crashed, most probably due to a bug.
+}
+
+
+class Prover9CommandParent:
+    """
+    A common base class used by both ``Prover9Command`` and ``MaceCommand``,
+    which is responsible for maintaining a goal and a set of assumptions,
+    and generating prover9-style input files from them.
+    """
+
+    def print_assumptions(self, output_format="nltk"):
+        """
+        Print the list of the current assumptions.
+        """
+        if output_format.lower() == "nltk":
+            for a in self.assumptions():
+                print(a)
+        elif output_format.lower() == "prover9":
+            for a in convert_to_prover9(self.assumptions()):
+                print(a)
+        else:
+            raise NameError(
+                "Unrecognized value for 'output_format': %s" % output_format
+            )
+
+
+class Prover9Command(Prover9CommandParent, BaseProverCommand):
+    """
+    A ``ProverCommand`` specific to the ``Prover9`` prover.  It contains
+    a print_assumptions() method that is used to print the list
+    of assumptions in multiple formats.
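+
+    For example (a sketch; this assumes a working local Prover9 installation
+    that NLTK can locate)::
+
+        a1 = Expression.fromstring("all x.(man(x) -> mortal(x))")
+        a2 = Expression.fromstring("man(Socrates)")
+        g = Expression.fromstring("mortal(Socrates)")
+        print(Prover9Command(g, assumptions=[a1, a2]).prove())  # True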
+ """ + + def __init__(self, goal=None, assumptions=None, timeout=60, prover=None): + """ + :param goal: Input expression to prove + :type goal: sem.Expression + :param assumptions: Input expressions to use as assumptions in + the proof. + :type assumptions: list(sem.Expression) + :param timeout: number of seconds before timeout; set to 0 for + no timeout. + :type timeout: int + :param prover: a prover. If not set, one will be created. + :type prover: Prover9 + """ + if not assumptions: + assumptions = [] + + if prover is not None: + assert isinstance(prover, Prover9) + else: + prover = Prover9(timeout) + + BaseProverCommand.__init__(self, prover, goal, assumptions) + + def decorate_proof(self, proof_string, simplify=True): + """ + :see BaseProverCommand.decorate_proof() + """ + if simplify: + return self._prover._call_prooftrans(proof_string, ["striplabels"])[ + 0 + ].rstrip() + else: + return proof_string.rstrip() + + +class Prover9Parent: + """ + A common class extended by both ``Prover9`` and ``Mace ``. + It contains the functionality required to convert NLTK-style + expressions into Prover9-style expressions. + """ + + _binary_location = None + + def config_prover9(self, binary_location, verbose=False): + if binary_location is None: + self._binary_location = None + self._prover9_bin = None + else: + name = "prover9" + self._prover9_bin = nltk.internals.find_binary( + name, + path_to_bin=binary_location, + env_vars=["PROVER9"], + url="https://www.cs.unm.edu/~mccune/prover9/", + binary_names=[name, name + ".exe"], + verbose=verbose, + ) + self._binary_location = self._prover9_bin.rsplit(os.path.sep, 1) + + def prover9_input(self, goal, assumptions): + """ + :return: The input string that should be provided to the + prover9 binary. This string is formed based on the goal, + assumptions, and timeout value of this object. + """ + s = "" + + if assumptions: + s += "formulas(assumptions).\n" + for p9_assumption in convert_to_prover9(assumptions): + s += " %s.\n" % p9_assumption + s += "end_of_list.\n\n" + + if goal: + s += "formulas(goals).\n" + s += " %s.\n" % convert_to_prover9(goal) + s += "end_of_list.\n\n" + + return s + + def binary_locations(self): + """ + A list of directories that should be searched for the prover9 + executables. This list is used by ``config_prover9`` when searching + for the prover9 executables. + """ + return [ + "/usr/local/bin/prover9", + "/usr/local/bin/prover9/bin", + "/usr/local/bin", + "/usr/bin", + "/usr/local/prover9", + "/usr/local/share/prover9", + ] + + def _find_binary(self, name, verbose=False): + binary_locations = self.binary_locations() + if self._binary_location is not None: + binary_locations += [self._binary_location] + return nltk.internals.find_binary( + name, + searchpath=binary_locations, + env_vars=["PROVER9"], + url="https://www.cs.unm.edu/~mccune/prover9/", + binary_names=[name, name + ".exe"], + verbose=verbose, + ) + + def _call(self, input_str, binary, args=[], verbose=False): + """ + Call the binary with the given input. + + :param input_str: A string whose contents are used as stdin. + :param binary: The location of the binary to call + :param args: A list of command-line arguments. 
+ :return: A tuple (stdout, returncode) + :see: ``config_prover9`` + """ + if verbose: + print("Calling:", binary) + print("Args:", args) + print("Input:\n", input_str, "\n") + + # Call prover9 via a subprocess + cmd = [binary] + args + try: + input_str = input_str.encode("utf8") + except AttributeError: + pass + p = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.PIPE + ) + (stdout, stderr) = p.communicate(input=input_str) + + if verbose: + print("Return code:", p.returncode) + if stdout: + print("stdout:\n", stdout, "\n") + if stderr: + print("stderr:\n", stderr, "\n") + + return (stdout.decode("utf-8"), p.returncode) + + +def convert_to_prover9(input): + """ + Convert a ``logic.Expression`` to Prover9 format. + """ + if isinstance(input, list): + result = [] + for s in input: + try: + result.append(_convert_to_prover9(s.simplify())) + except: + print("input %s cannot be converted to Prover9 input syntax" % input) + raise + return result + else: + try: + return _convert_to_prover9(input.simplify()) + except: + print("input %s cannot be converted to Prover9 input syntax" % input) + raise + + +def _convert_to_prover9(expression): + """ + Convert ``logic.Expression`` to Prover9 formatted string. + """ + if isinstance(expression, ExistsExpression): + return ( + "exists " + + str(expression.variable) + + " " + + _convert_to_prover9(expression.term) + ) + elif isinstance(expression, AllExpression): + return ( + "all " + + str(expression.variable) + + " " + + _convert_to_prover9(expression.term) + ) + elif isinstance(expression, NegatedExpression): + return "-(" + _convert_to_prover9(expression.term) + ")" + elif isinstance(expression, AndExpression): + return ( + "(" + + _convert_to_prover9(expression.first) + + " & " + + _convert_to_prover9(expression.second) + + ")" + ) + elif isinstance(expression, OrExpression): + return ( + "(" + + _convert_to_prover9(expression.first) + + " | " + + _convert_to_prover9(expression.second) + + ")" + ) + elif isinstance(expression, ImpExpression): + return ( + "(" + + _convert_to_prover9(expression.first) + + " -> " + + _convert_to_prover9(expression.second) + + ")" + ) + elif isinstance(expression, IffExpression): + return ( + "(" + + _convert_to_prover9(expression.first) + + " <-> " + + _convert_to_prover9(expression.second) + + ")" + ) + elif isinstance(expression, EqualityExpression): + return ( + "(" + + _convert_to_prover9(expression.first) + + " = " + + _convert_to_prover9(expression.second) + + ")" + ) + else: + return str(expression) + + +class Prover9(Prover9Parent, Prover): + _prover9_bin = None + _prooftrans_bin = None + + def __init__(self, timeout=60): + self._timeout = timeout + """The timeout value for prover9. If a proof can not be found + in this amount of time, then prover9 will return false. + (Use 0 for no timeout.)""" + + def _prove(self, goal=None, assumptions=None, verbose=False): + """ + Use Prover9 to prove a theorem. + :return: A pair whose first element is a boolean indicating if the + proof was successful (i.e. returns value of 0) and whose second element + is the output of the prover. 
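+        The timeout is enforced by prepending ``assign(max_seconds, N).`` to
+        the Prover9 input; see ``_call_prover9``.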
+ """ + if not assumptions: + assumptions = [] + + stdout, returncode = self._call_prover9( + self.prover9_input(goal, assumptions), verbose=verbose + ) + return (returncode == 0, stdout) + + def prover9_input(self, goal, assumptions): + """ + :see: Prover9Parent.prover9_input + """ + s = "clear(auto_denials).\n" # only one proof required + return s + Prover9Parent.prover9_input(self, goal, assumptions) + + def _call_prover9(self, input_str, args=[], verbose=False): + """ + Call the ``prover9`` binary with the given input. + + :param input_str: A string whose contents are used as stdin. + :param args: A list of command-line arguments. + :return: A tuple (stdout, returncode) + :see: ``config_prover9`` + """ + if self._prover9_bin is None: + self._prover9_bin = self._find_binary("prover9", verbose) + + updated_input_str = "" + if self._timeout > 0: + updated_input_str += "assign(max_seconds, %d).\n\n" % self._timeout + updated_input_str += input_str + + stdout, returncode = self._call( + updated_input_str, self._prover9_bin, args, verbose + ) + + if returncode not in [0, 2]: + errormsgprefix = "%%ERROR:" + if errormsgprefix in stdout: + msgstart = stdout.index(errormsgprefix) + errormsg = stdout[msgstart:].strip() + else: + errormsg = None + if returncode in [3, 4, 5, 6]: + raise Prover9LimitExceededException(returncode, errormsg) + else: + raise Prover9FatalException(returncode, errormsg) + + return stdout, returncode + + def _call_prooftrans(self, input_str, args=[], verbose=False): + """ + Call the ``prooftrans`` binary with the given input. + + :param input_str: A string whose contents are used as stdin. + :param args: A list of command-line arguments. + :return: A tuple (stdout, returncode) + :see: ``config_prover9`` + """ + if self._prooftrans_bin is None: + self._prooftrans_bin = self._find_binary("prooftrans", verbose) + + return self._call(input_str, self._prooftrans_bin, args, verbose) + + +class Prover9Exception(Exception): + def __init__(self, returncode, message): + msg = p9_return_codes[returncode] + if message: + msg += "\n%s" % message + Exception.__init__(self, msg) + + +class Prover9FatalException(Prover9Exception): + pass + + +class Prover9LimitExceededException(Prover9Exception): + pass + + +###################################################################### +# { Tests and Demos +###################################################################### + + +def test_config(): + + a = Expression.fromstring("(walk(j) & sing(j))") + g = Expression.fromstring("walk(j)") + p = Prover9Command(g, assumptions=[a]) + p._executable_path = None + p.prover9_search = [] + p.prove() + # config_prover9('/usr/local/bin') + print(p.prove()) + print(p.proof()) + + +def test_convert_to_prover9(expr): + """ + Test that parsing works OK. + """ + for t in expr: + e = Expression.fromstring(t) + print(convert_to_prover9(e)) + + +def test_prove(arguments): + """ + Try some proofs and exhibit the results. 
+ """ + for (goal, assumptions) in arguments: + g = Expression.fromstring(goal) + alist = [Expression.fromstring(a) for a in assumptions] + p = Prover9Command(g, assumptions=alist).prove() + for a in alist: + print(" %s" % a) + print(f"|- {g}: {p}\n") + + +arguments = [ + ("(man(x) <-> (not (not man(x))))", []), + ("(not (man(x) & (not man(x))))", []), + ("(man(x) | (not man(x)))", []), + ("(man(x) & (not man(x)))", []), + ("(man(x) -> man(x))", []), + ("(not (man(x) & (not man(x))))", []), + ("(man(x) | (not man(x)))", []), + ("(man(x) -> man(x))", []), + ("(man(x) <-> man(x))", []), + ("(not (man(x) <-> (not man(x))))", []), + ("mortal(Socrates)", ["all x.(man(x) -> mortal(x))", "man(Socrates)"]), + ("((all x.(man(x) -> walks(x)) & man(Socrates)) -> some y.walks(y))", []), + ("(all x.man(x) -> all x.man(x))", []), + ("some x.all y.sees(x,y)", []), + ( + "some e3.(walk(e3) & subj(e3, mary))", + [ + "some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))" + ], + ), + ( + "some x e1.(see(e1) & subj(e1, x) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))", + [ + "some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))" + ], + ), +] + +expressions = [ + r"some x y.sees(x,y)", + r"some x.(man(x) & walks(x))", + r"\x.(man(x) & walks(x))", + r"\x y.sees(x,y)", + r"walks(john)", + r"\x.big(x, \y.mouse(y))", + r"(walks(x) & (runs(x) & (threes(x) & fours(x))))", + r"(walks(x) -> runs(x))", + r"some x.(PRO(x) & sees(John, x))", + r"some x.(man(x) & (not walks(x)))", + r"all x.(man(x) -> walks(x))", +] + + +def spacer(num=45): + print("-" * num) + + +def demo(): + print("Testing configuration") + spacer() + test_config() + print() + print("Testing conversion to Prover9 format") + spacer() + test_convert_to_prover9(expressions) + print() + print("Testing proofs") + spacer() + test_prove(arguments) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/inference/resolution.py b/.eggs/nltk-3.8-py3.10.egg/nltk/inference/resolution.py new file mode 100644 index 0000000000000000000000000000000000000000..b9fd2584785d56e5495cc054e6a0351e6adc050b --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/inference/resolution.py @@ -0,0 +1,759 @@ +# Natural Language Toolkit: First-order Resolution-based Theorem Prover +# +# Author: Dan Garrette +# +# Copyright (C) 2001-2022 NLTK Project +# URL: +# For license information, see LICENSE.TXT + +""" +Module for a resolution-based First Order theorem prover. 
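+
+The prover works by refutation: the negated goal and the assumptions are
+converted to clauses (see ``clausify``), and pairs of clauses are resolved
+until an empty clause is derived (establishing the goal) or no new clauses
+can be produced (the proof attempt fails).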
+""" + +import operator +from collections import defaultdict +from functools import reduce + +from nltk.inference.api import BaseProverCommand, Prover +from nltk.sem import skolemize +from nltk.sem.logic import ( + AndExpression, + ApplicationExpression, + EqualityExpression, + Expression, + IndividualVariableExpression, + NegatedExpression, + OrExpression, + Variable, + VariableExpression, + is_indvar, + unique_variable, +) + + +class ProverParseError(Exception): + pass + + +class ResolutionProver(Prover): + ANSWER_KEY = "ANSWER" + _assume_false = True + + def _prove(self, goal=None, assumptions=None, verbose=False): + """ + :param goal: Input expression to prove + :type goal: sem.Expression + :param assumptions: Input expressions to use as assumptions in the proof + :type assumptions: list(sem.Expression) + """ + if not assumptions: + assumptions = [] + + result = None + try: + clauses = [] + if goal: + clauses.extend(clausify(-goal)) + for a in assumptions: + clauses.extend(clausify(a)) + result, clauses = self._attempt_proof(clauses) + if verbose: + print(ResolutionProverCommand._decorate_clauses(clauses)) + except RuntimeError as e: + if self._assume_false and str(e).startswith( + "maximum recursion depth exceeded" + ): + result = False + clauses = [] + else: + if verbose: + print(e) + else: + raise e + return (result, clauses) + + def _attempt_proof(self, clauses): + # map indices to lists of indices, to store attempted unifications + tried = defaultdict(list) + + i = 0 + while i < len(clauses): + if not clauses[i].is_tautology(): + # since we try clauses in order, we should start after the last + # index tried + if tried[i]: + j = tried[i][-1] + 1 + else: + j = i + 1 # nothing tried yet for 'i', so start with the next + + while j < len(clauses): + # don't: 1) unify a clause with itself, + # 2) use tautologies + if i != j and j and not clauses[j].is_tautology(): + tried[i].append(j) + newclauses = clauses[i].unify(clauses[j]) + if newclauses: + for newclause in newclauses: + newclause._parents = (i + 1, j + 1) + clauses.append(newclause) + if not len(newclause): # if there's an empty clause + return (True, clauses) + i = -1 # since we added a new clause, restart from the top + break + j += 1 + i += 1 + return (False, clauses) + + +class ResolutionProverCommand(BaseProverCommand): + def __init__(self, goal=None, assumptions=None, prover=None): + """ + :param goal: Input expression to prove + :type goal: sem.Expression + :param assumptions: Input expressions to use as assumptions in + the proof. + :type assumptions: list(sem.Expression) + """ + if prover is not None: + assert isinstance(prover, ResolutionProver) + else: + prover = ResolutionProver() + + BaseProverCommand.__init__(self, prover, goal, assumptions) + self._clauses = None + + def prove(self, verbose=False): + """ + Perform the actual proof. Store the result to prevent unnecessary + re-proving. 
+ """ + if self._result is None: + self._result, clauses = self._prover._prove( + self.goal(), self.assumptions(), verbose + ) + self._clauses = clauses + self._proof = ResolutionProverCommand._decorate_clauses(clauses) + return self._result + + def find_answers(self, verbose=False): + self.prove(verbose) + + answers = set() + answer_ex = VariableExpression(Variable(ResolutionProver.ANSWER_KEY)) + for clause in self._clauses: + for term in clause: + if ( + isinstance(term, ApplicationExpression) + and term.function == answer_ex + and not isinstance(term.argument, IndividualVariableExpression) + ): + answers.add(term.argument) + return answers + + @staticmethod + def _decorate_clauses(clauses): + """ + Decorate the proof output. + """ + out = "" + max_clause_len = max(len(str(clause)) for clause in clauses) + max_seq_len = len(str(len(clauses))) + for i in range(len(clauses)): + parents = "A" + taut = "" + if clauses[i].is_tautology(): + taut = "Tautology" + if clauses[i]._parents: + parents = str(clauses[i]._parents) + parents = " " * (max_clause_len - len(str(clauses[i])) + 1) + parents + seq = " " * (max_seq_len - len(str(i + 1))) + str(i + 1) + out += f"[{seq}] {clauses[i]} {parents} {taut}\n" + return out + + +class Clause(list): + def __init__(self, data): + list.__init__(self, data) + self._is_tautology = None + self._parents = None + + def unify(self, other, bindings=None, used=None, skipped=None, debug=False): + """ + Attempt to unify this Clause with the other, returning a list of + resulting, unified, Clauses. + + :param other: ``Clause`` with which to unify + :param bindings: ``BindingDict`` containing bindings that should be used + during the unification + :param used: tuple of two lists of atoms. The first lists the + atoms from 'self' that were successfully unified with atoms from + 'other'. The second lists the atoms from 'other' that were successfully + unified with atoms from 'self'. + :param skipped: tuple of two ``Clause`` objects. The first is a list of all + the atoms from the 'self' Clause that have not been unified with + anything on the path. The second is same thing for the 'other' Clause. + :param debug: bool indicating whether debug statements should print + :return: list containing all the resulting ``Clause`` objects that could be + obtained by unification + """ + if bindings is None: + bindings = BindingDict() + if used is None: + used = ([], []) + if skipped is None: + skipped = ([], []) + if isinstance(debug, bool): + debug = DebugObject(debug) + + newclauses = _iterate_first( + self, other, bindings, used, skipped, _complete_unify_path, debug + ) + + # remove subsumed clauses. make a list of all indices of subsumed + # clauses, and then remove them from the list + subsumed = [] + for i, c1 in enumerate(newclauses): + if i not in subsumed: + for j, c2 in enumerate(newclauses): + if i != j and j not in subsumed and c1.subsumes(c2): + subsumed.append(j) + result = [] + for i in range(len(newclauses)): + if i not in subsumed: + result.append(newclauses[i]) + + return result + + def isSubsetOf(self, other): + """ + Return True iff every term in 'self' is a term in 'other'. + + :param other: ``Clause`` + :return: bool + """ + for a in self: + if a not in other: + return False + return True + + def subsumes(self, other): + """ + Return True iff 'self' subsumes 'other', this is, if there is a + substitution such that every term in 'self' can be unified with a term + in 'other'. 
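+        For example, ``{-man(x), mortal(x)}`` subsumes
+        ``{-man(Socrates), mortal(Socrates), walks(Socrates)}`` under the
+        substitution ``x -> Socrates``.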
+ + :param other: ``Clause`` + :return: bool + """ + negatedother = [] + for atom in other: + if isinstance(atom, NegatedExpression): + negatedother.append(atom.term) + else: + negatedother.append(-atom) + + negatedotherClause = Clause(negatedother) + + bindings = BindingDict() + used = ([], []) + skipped = ([], []) + debug = DebugObject(False) + + return ( + len( + _iterate_first( + self, + negatedotherClause, + bindings, + used, + skipped, + _subsumes_finalize, + debug, + ) + ) + > 0 + ) + + def __getslice__(self, start, end): + return Clause(list.__getslice__(self, start, end)) + + def __sub__(self, other): + return Clause([a for a in self if a not in other]) + + def __add__(self, other): + return Clause(list.__add__(self, other)) + + def is_tautology(self): + """ + Self is a tautology if it contains ground terms P and -P. The ground + term, P, must be an exact match, ie, not using unification. + """ + if self._is_tautology is not None: + return self._is_tautology + for i, a in enumerate(self): + if not isinstance(a, EqualityExpression): + j = len(self) - 1 + while j > i: + b = self[j] + if isinstance(a, NegatedExpression): + if a.term == b: + self._is_tautology = True + return True + elif isinstance(b, NegatedExpression): + if a == b.term: + self._is_tautology = True + return True + j -= 1 + self._is_tautology = False + return False + + def free(self): + return reduce(operator.or_, ((atom.free() | atom.constants()) for atom in self)) + + def replace(self, variable, expression): + """ + Replace every instance of variable with expression across every atom + in the clause + + :param variable: ``Variable`` + :param expression: ``Expression`` + """ + return Clause([atom.replace(variable, expression) for atom in self]) + + def substitute_bindings(self, bindings): + """ + Replace every binding + + :param bindings: A list of tuples mapping Variable Expressions to the + Expressions to which they are bound. + :return: ``Clause`` + """ + return Clause([atom.substitute_bindings(bindings) for atom in self]) + + def __str__(self): + return "{" + ", ".join("%s" % item for item in self) + "}" + + def __repr__(self): + return "%s" % self + + +def _iterate_first(first, second, bindings, used, skipped, finalize_method, debug): + """ + This method facilitates movement through the terms of 'self' + """ + debug.line(f"unify({first},{second}) {bindings}") + + if not len(first) or not len(second): # if no more recursions can be performed + return finalize_method(first, second, bindings, used, skipped, debug) + else: + # explore this 'self' atom + result = _iterate_second( + first, second, bindings, used, skipped, finalize_method, debug + 1 + ) + + # skip this possible 'self' atom + newskipped = (skipped[0] + [first[0]], skipped[1]) + result += _iterate_first( + first[1:], second, bindings, used, newskipped, finalize_method, debug + 1 + ) + + try: + newbindings, newused, unused = _unify_terms( + first[0], second[0], bindings, used + ) + # Unification found, so progress with this line of unification + # put skipped and unused terms back into play for later unification. 
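+            # (during demodulation _unify_terms consumes only the equality
+            # atom, so its partner is returned via 'unused' and re-queued here)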
+ newfirst = first[1:] + skipped[0] + unused[0] + newsecond = second[1:] + skipped[1] + unused[1] + result += _iterate_first( + newfirst, + newsecond, + newbindings, + newused, + ([], []), + finalize_method, + debug + 1, + ) + except BindingException: + # the atoms could not be unified, + pass + + return result + + +def _iterate_second(first, second, bindings, used, skipped, finalize_method, debug): + """ + This method facilitates movement through the terms of 'other' + """ + debug.line(f"unify({first},{second}) {bindings}") + + if not len(first) or not len(second): # if no more recursions can be performed + return finalize_method(first, second, bindings, used, skipped, debug) + else: + # skip this possible pairing and move to the next + newskipped = (skipped[0], skipped[1] + [second[0]]) + result = _iterate_second( + first, second[1:], bindings, used, newskipped, finalize_method, debug + 1 + ) + + try: + newbindings, newused, unused = _unify_terms( + first[0], second[0], bindings, used + ) + # Unification found, so progress with this line of unification + # put skipped and unused terms back into play for later unification. + newfirst = first[1:] + skipped[0] + unused[0] + newsecond = second[1:] + skipped[1] + unused[1] + result += _iterate_second( + newfirst, + newsecond, + newbindings, + newused, + ([], []), + finalize_method, + debug + 1, + ) + except BindingException: + # the atoms could not be unified, + pass + + return result + + +def _unify_terms(a, b, bindings=None, used=None): + """ + This method attempts to unify two terms. Two expressions are unifiable + if there exists a substitution function S such that S(a) == S(-b). + + :param a: ``Expression`` + :param b: ``Expression`` + :param bindings: ``BindingDict`` a starting set of bindings with which + the unification must be consistent + :return: ``BindingDict`` A dictionary of the bindings required to unify + :raise ``BindingException``: If the terms cannot be unified + """ + assert isinstance(a, Expression) + assert isinstance(b, Expression) + + if bindings is None: + bindings = BindingDict() + if used is None: + used = ([], []) + + # Use resolution + if isinstance(a, NegatedExpression) and isinstance(b, ApplicationExpression): + newbindings = most_general_unification(a.term, b, bindings) + newused = (used[0] + [a], used[1] + [b]) + unused = ([], []) + elif isinstance(a, ApplicationExpression) and isinstance(b, NegatedExpression): + newbindings = most_general_unification(a, b.term, bindings) + newused = (used[0] + [a], used[1] + [b]) + unused = ([], []) + + # Use demodulation + elif isinstance(a, EqualityExpression): + newbindings = BindingDict([(a.first.variable, a.second)]) + newused = (used[0] + [a], used[1]) + unused = ([], [b]) + elif isinstance(b, EqualityExpression): + newbindings = BindingDict([(b.first.variable, b.second)]) + newused = (used[0], used[1] + [b]) + unused = ([a], []) + + else: + raise BindingException((a, b)) + + return newbindings, newused, unused + + +def _complete_unify_path(first, second, bindings, used, skipped, debug): + if used[0] or used[1]: # if bindings were made along the path + newclause = Clause(skipped[0] + skipped[1] + first + second) + debug.line(" -> New Clause: %s" % newclause) + return [newclause.substitute_bindings(bindings)] + else: # no bindings made means no unification occurred. 
so no result + debug.line(" -> End") + return [] + + +def _subsumes_finalize(first, second, bindings, used, skipped, debug): + if not len(skipped[0]) and not len(first): + # If there are no skipped terms and no terms left in 'first', then + # all of the terms in the original 'self' were unified with terms + # in 'other'. Therefore, there exists a binding (this one) such that + # every term in self can be unified with a term in other, which + # is the definition of subsumption. + return [True] + else: + return [] + + +def clausify(expression): + """ + Skolemize, clausify, and standardize the variables apart. + """ + clause_list = [] + for clause in _clausify(skolemize(expression)): + for free in clause.free(): + if is_indvar(free.name): + newvar = VariableExpression(unique_variable()) + clause = clause.replace(free, newvar) + clause_list.append(clause) + return clause_list + + +def _clausify(expression): + """ + :param expression: a skolemized expression in CNF + """ + if isinstance(expression, AndExpression): + return _clausify(expression.first) + _clausify(expression.second) + elif isinstance(expression, OrExpression): + first = _clausify(expression.first) + second = _clausify(expression.second) + assert len(first) == 1 + assert len(second) == 1 + return [first[0] + second[0]] + elif isinstance(expression, EqualityExpression): + return [Clause([expression])] + elif isinstance(expression, ApplicationExpression): + return [Clause([expression])] + elif isinstance(expression, NegatedExpression): + if isinstance(expression.term, ApplicationExpression): + return [Clause([expression])] + elif isinstance(expression.term, EqualityExpression): + return [Clause([expression])] + raise ProverParseError() + + +class BindingDict: + def __init__(self, binding_list=None): + """ + :param binding_list: list of (``AbstractVariableExpression``, ``AtomicExpression``) to initialize the dictionary + """ + self.d = {} + + if binding_list: + for (v, b) in binding_list: + self[v] = b + + def __setitem__(self, variable, binding): + """ + A binding is consistent with the dict if its variable is not already bound, OR if its + variable is already bound to its argument. 
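+        For example, once ``x`` is bound to ``john``, rebinding ``x`` to
+        ``john`` succeeds (it is already consistent), while binding ``x`` to
+        ``mary`` raises a ``BindingException``.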
+ + :param variable: ``Variable`` The variable to bind + :param binding: ``Expression`` The atomic to which 'variable' should be bound + :raise BindingException: If the variable cannot be bound in this dictionary + """ + assert isinstance(variable, Variable) + assert isinstance(binding, Expression) + + try: + existing = self[variable] + except KeyError: + existing = None + + if not existing or binding == existing: + self.d[variable] = binding + elif isinstance(binding, IndividualVariableExpression): + # Since variable is already bound, try to bind binding to variable + try: + existing = self[binding.variable] + except KeyError: + existing = None + + binding2 = VariableExpression(variable) + + if not existing or binding2 == existing: + self.d[binding.variable] = binding2 + else: + raise BindingException( + "Variable %s already bound to another " "value" % (variable) + ) + else: + raise BindingException( + "Variable %s already bound to another " "value" % (variable) + ) + + def __getitem__(self, variable): + """ + Return the expression to which 'variable' is bound + """ + assert isinstance(variable, Variable) + + intermediate = self.d[variable] + while intermediate: + try: + intermediate = self.d[intermediate] + except KeyError: + return intermediate + + def __contains__(self, item): + return item in self.d + + def __add__(self, other): + """ + :param other: ``BindingDict`` The dict with which to combine self + :return: ``BindingDict`` A new dict containing all the elements of both parameters + :raise BindingException: If the parameter dictionaries are not consistent with each other + """ + try: + combined = BindingDict() + for v in self.d: + combined[v] = self.d[v] + for v in other.d: + combined[v] = other.d[v] + return combined + except BindingException as e: + raise BindingException( + "Attempting to add two contradicting " + "BindingDicts: '%s' and '%s'" % (self, other) + ) from e + + def __len__(self): + return len(self.d) + + def __str__(self): + data_str = ", ".join(f"{v}: {self.d[v]}" for v in sorted(self.d.keys())) + return "{" + data_str + "}" + + def __repr__(self): + return "%s" % self + + +def most_general_unification(a, b, bindings=None): + """ + Find the most general unification of the two given expressions + + :param a: ``Expression`` + :param b: ``Expression`` + :param bindings: ``BindingDict`` a starting set of bindings with which the + unification must be consistent + :return: a list of bindings + :raise BindingException: if the Expressions cannot be unified + """ + if bindings is None: + bindings = BindingDict() + + if a == b: + return bindings + elif isinstance(a, IndividualVariableExpression): + return _mgu_var(a, b, bindings) + elif isinstance(b, IndividualVariableExpression): + return _mgu_var(b, a, bindings) + elif isinstance(a, ApplicationExpression) and isinstance(b, ApplicationExpression): + return most_general_unification( + a.function, b.function, bindings + ) + most_general_unification(a.argument, b.argument, bindings) + raise BindingException((a, b)) + + +def _mgu_var(var, expression, bindings): + if var.variable in expression.free() | expression.constants(): + raise BindingException((var, expression)) + else: + return BindingDict([(var.variable, expression)]) + bindings + + +class BindingException(Exception): + def __init__(self, arg): + if isinstance(arg, tuple): + Exception.__init__(self, "'%s' cannot be bound to '%s'" % arg) + else: + Exception.__init__(self, arg) + + +class UnificationException(Exception): + def __init__(self, a, b): + 
Exception.__init__(self, f"'{a}' cannot unify with '{b}'") + + +class DebugObject: + def __init__(self, enabled=True, indent=0): + self.enabled = enabled + self.indent = indent + + def __add__(self, i): + return DebugObject(self.enabled, self.indent + i) + + def line(self, line): + if self.enabled: + print(" " * self.indent + line) + + +def testResolutionProver(): + resolution_test(r"man(x)") + resolution_test(r"(man(x) -> man(x))") + resolution_test(r"(man(x) -> --man(x))") + resolution_test(r"-(man(x) and -man(x))") + resolution_test(r"(man(x) or -man(x))") + resolution_test(r"(man(x) -> man(x))") + resolution_test(r"-(man(x) and -man(x))") + resolution_test(r"(man(x) or -man(x))") + resolution_test(r"(man(x) -> man(x))") + resolution_test(r"(man(x) iff man(x))") + resolution_test(r"-(man(x) iff -man(x))") + resolution_test("all x.man(x)") + resolution_test("-all x.some y.F(x,y) & some x.all y.(-F(x,y))") + resolution_test("some x.all y.sees(x,y)") + + p1 = Expression.fromstring(r"all x.(man(x) -> mortal(x))") + p2 = Expression.fromstring(r"man(Socrates)") + c = Expression.fromstring(r"mortal(Socrates)") + print(f"{p1}, {p2} |- {c}: {ResolutionProver().prove(c, [p1, p2])}") + + p1 = Expression.fromstring(r"all x.(man(x) -> walks(x))") + p2 = Expression.fromstring(r"man(John)") + c = Expression.fromstring(r"some y.walks(y)") + print(f"{p1}, {p2} |- {c}: {ResolutionProver().prove(c, [p1, p2])}") + + p = Expression.fromstring(r"some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))") + c = Expression.fromstring(r"some e0.walk(e0,mary)") + print(f"{p} |- {c}: {ResolutionProver().prove(c, [p])}") + + +def resolution_test(e): + f = Expression.fromstring(e) + t = ResolutionProver().prove(f) + print(f"|- {f}: {t}") + + +def test_clausify(): + lexpr = Expression.fromstring + + print(clausify(lexpr("P(x) | Q(x)"))) + print(clausify(lexpr("(P(x) & Q(x)) | R(x)"))) + print(clausify(lexpr("P(x) | (Q(x) & R(x))"))) + print(clausify(lexpr("(P(x) & Q(x)) | (R(x) & S(x))"))) + + print(clausify(lexpr("P(x) | Q(x) | R(x)"))) + print(clausify(lexpr("P(x) | (Q(x) & R(x)) | S(x)"))) + + print(clausify(lexpr("exists x.P(x) | Q(x)"))) + + print(clausify(lexpr("-(-P(x) & Q(x))"))) + print(clausify(lexpr("P(x) <-> Q(x)"))) + print(clausify(lexpr("-(P(x) <-> Q(x))"))) + print(clausify(lexpr("-(all x.P(x))"))) + print(clausify(lexpr("-(some x.P(x))"))) + + print(clausify(lexpr("some x.P(x)"))) + print(clausify(lexpr("some x.all y.P(x,y)"))) + print(clausify(lexpr("all y.some x.P(x,y)"))) + print(clausify(lexpr("all z.all y.some x.P(x,y,z)"))) + print(clausify(lexpr("all x.(all y.P(x,y) -> -all y.(Q(x,y) -> R(x,y)))"))) + + +def demo(): + test_clausify() + print() + testResolutionProver() + print() + + p = Expression.fromstring("man(x)") + print(ResolutionProverCommand(p, [p]).prove()) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/inference/tableau.py b/.eggs/nltk-3.8-py3.10.egg/nltk/inference/tableau.py new file mode 100644 index 0000000000000000000000000000000000000000..2752757aabcb58806ba2f4cb01779e340d16c08d --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/inference/tableau.py @@ -0,0 +1,712 @@ +# Natural Language Toolkit: First-Order Tableau Theorem Prover +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Dan Garrette +# +# URL: +# For license information, see LICENSE.TXT + +""" +Module for a tableau-based First Order theorem prover. 
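+
+Usage sketch (illustrative; it mirrors the ``tableau_test`` helper defined at
+the bottom of this module):
+
+    >>> from nltk.sem import Expression
+    >>> from nltk.inference.tableau import TableauProver
+    >>> goal = Expression.fromstring('(man(x) -> man(x))')
+    >>> TableauProver().prove(goal)
+    True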
+""" + +from nltk.inference.api import BaseProverCommand, Prover +from nltk.internals import Counter +from nltk.sem.logic import ( + AbstractVariableExpression, + AllExpression, + AndExpression, + ApplicationExpression, + EqualityExpression, + ExistsExpression, + Expression, + FunctionVariableExpression, + IffExpression, + ImpExpression, + LambdaExpression, + NegatedExpression, + OrExpression, + Variable, + VariableExpression, + unique_variable, +) + +_counter = Counter() + + +class ProverParseError(Exception): + pass + + +class TableauProver(Prover): + _assume_false = False + + def _prove(self, goal=None, assumptions=None, verbose=False): + if not assumptions: + assumptions = [] + + result = None + try: + agenda = Agenda() + if goal: + agenda.put(-goal) + agenda.put_all(assumptions) + debugger = Debug(verbose) + result = self._attempt_proof(agenda, set(), set(), debugger) + except RuntimeError as e: + if self._assume_false and str(e).startswith( + "maximum recursion depth exceeded" + ): + result = False + else: + if verbose: + print(e) + else: + raise e + return (result, "\n".join(debugger.lines)) + + def _attempt_proof(self, agenda, accessible_vars, atoms, debug): + (current, context), category = agenda.pop_first() + + # if there's nothing left in the agenda, and we haven't closed the path + if not current: + debug.line("AGENDA EMPTY") + return False + + proof_method = { + Categories.ATOM: self._attempt_proof_atom, + Categories.PROP: self._attempt_proof_prop, + Categories.N_ATOM: self._attempt_proof_n_atom, + Categories.N_PROP: self._attempt_proof_n_prop, + Categories.APP: self._attempt_proof_app, + Categories.N_APP: self._attempt_proof_n_app, + Categories.N_EQ: self._attempt_proof_n_eq, + Categories.D_NEG: self._attempt_proof_d_neg, + Categories.N_ALL: self._attempt_proof_n_all, + Categories.N_EXISTS: self._attempt_proof_n_some, + Categories.AND: self._attempt_proof_and, + Categories.N_OR: self._attempt_proof_n_or, + Categories.N_IMP: self._attempt_proof_n_imp, + Categories.OR: self._attempt_proof_or, + Categories.IMP: self._attempt_proof_imp, + Categories.N_AND: self._attempt_proof_n_and, + Categories.IFF: self._attempt_proof_iff, + Categories.N_IFF: self._attempt_proof_n_iff, + Categories.EQ: self._attempt_proof_eq, + Categories.EXISTS: self._attempt_proof_some, + Categories.ALL: self._attempt_proof_all, + }[category] + + debug.line((current, context)) + return proof_method(current, context, agenda, accessible_vars, atoms, debug) + + def _attempt_proof_atom( + self, current, context, agenda, accessible_vars, atoms, debug + ): + # Check if the branch is closed. Return 'True' if it is + if (current, True) in atoms: + debug.line("CLOSED", 1) + return True + + if context: + if isinstance(context.term, NegatedExpression): + current = current.negate() + agenda.put(context(current).simplify()) + return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) + else: + # mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars + agenda.mark_alls_fresh() + return self._attempt_proof( + agenda, + accessible_vars | set(current.args), + atoms | {(current, False)}, + debug + 1, + ) + + def _attempt_proof_n_atom( + self, current, context, agenda, accessible_vars, atoms, debug + ): + # Check if the branch is closed. 
Return 'True' if it is + if (current.term, False) in atoms: + debug.line("CLOSED", 1) + return True + + if context: + if isinstance(context.term, NegatedExpression): + current = current.negate() + agenda.put(context(current).simplify()) + return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) + else: + # mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars + agenda.mark_alls_fresh() + return self._attempt_proof( + agenda, + accessible_vars | set(current.term.args), + atoms | {(current.term, True)}, + debug + 1, + ) + + def _attempt_proof_prop( + self, current, context, agenda, accessible_vars, atoms, debug + ): + # Check if the branch is closed. Return 'True' if it is + if (current, True) in atoms: + debug.line("CLOSED", 1) + return True + + # mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars + agenda.mark_alls_fresh() + return self._attempt_proof( + agenda, accessible_vars, atoms | {(current, False)}, debug + 1 + ) + + def _attempt_proof_n_prop( + self, current, context, agenda, accessible_vars, atoms, debug + ): + # Check if the branch is closed. Return 'True' if it is + if (current.term, False) in atoms: + debug.line("CLOSED", 1) + return True + + # mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars + agenda.mark_alls_fresh() + return self._attempt_proof( + agenda, accessible_vars, atoms | {(current.term, True)}, debug + 1 + ) + + def _attempt_proof_app( + self, current, context, agenda, accessible_vars, atoms, debug + ): + f, args = current.uncurry() + for i, arg in enumerate(args): + if not TableauProver.is_atom(arg): + ctx = f + nv = Variable("X%s" % _counter.get()) + for j, a in enumerate(args): + ctx = ctx(VariableExpression(nv)) if i == j else ctx(a) + if context: + ctx = context(ctx).simplify() + ctx = LambdaExpression(nv, ctx) + agenda.put(arg, ctx) + return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) + raise Exception("If this method is called, there must be a non-atomic argument") + + def _attempt_proof_n_app( + self, current, context, agenda, accessible_vars, atoms, debug + ): + f, args = current.term.uncurry() + for i, arg in enumerate(args): + if not TableauProver.is_atom(arg): + ctx = f + nv = Variable("X%s" % _counter.get()) + for j, a in enumerate(args): + ctx = ctx(VariableExpression(nv)) if i == j else ctx(a) + if context: + # combine new context with existing + ctx = context(ctx).simplify() + ctx = LambdaExpression(nv, -ctx) + agenda.put(-arg, ctx) + return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) + raise Exception("If this method is called, there must be a non-atomic argument") + + def _attempt_proof_n_eq( + self, current, context, agenda, accessible_vars, atoms, debug + ): + ########################################################################### + # Since 'current' is of type '~(a=b)', the path is closed if 'a' == 'b' + ########################################################################### + if current.term.first == current.term.second: + debug.line("CLOSED", 1) + return True + + agenda[Categories.N_EQ].add((current, context)) + current._exhausted = True + return self._attempt_proof( + agenda, + accessible_vars | {current.term.first, current.term.second}, + atoms, + debug + 1, + ) + + def _attempt_proof_d_neg( + self, current, context, agenda, accessible_vars, atoms, debug + ): + agenda.put(current.term.term, context) + 
return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) + + def _attempt_proof_n_all( + self, current, context, agenda, accessible_vars, atoms, debug + ): + agenda[Categories.EXISTS].add( + (ExistsExpression(current.term.variable, -current.term.term), context) + ) + return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) + + def _attempt_proof_n_some( + self, current, context, agenda, accessible_vars, atoms, debug + ): + agenda[Categories.ALL].add( + (AllExpression(current.term.variable, -current.term.term), context) + ) + return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) + + def _attempt_proof_and( + self, current, context, agenda, accessible_vars, atoms, debug + ): + agenda.put(current.first, context) + agenda.put(current.second, context) + return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) + + def _attempt_proof_n_or( + self, current, context, agenda, accessible_vars, atoms, debug + ): + agenda.put(-current.term.first, context) + agenda.put(-current.term.second, context) + return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) + + def _attempt_proof_n_imp( + self, current, context, agenda, accessible_vars, atoms, debug + ): + agenda.put(current.term.first, context) + agenda.put(-current.term.second, context) + return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) + + def _attempt_proof_or( + self, current, context, agenda, accessible_vars, atoms, debug + ): + new_agenda = agenda.clone() + agenda.put(current.first, context) + new_agenda.put(current.second, context) + return self._attempt_proof( + agenda, accessible_vars, atoms, debug + 1 + ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1) + + def _attempt_proof_imp( + self, current, context, agenda, accessible_vars, atoms, debug + ): + new_agenda = agenda.clone() + agenda.put(-current.first, context) + new_agenda.put(current.second, context) + return self._attempt_proof( + agenda, accessible_vars, atoms, debug + 1 + ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1) + + def _attempt_proof_n_and( + self, current, context, agenda, accessible_vars, atoms, debug + ): + new_agenda = agenda.clone() + agenda.put(-current.term.first, context) + new_agenda.put(-current.term.second, context) + return self._attempt_proof( + agenda, accessible_vars, atoms, debug + 1 + ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1) + + def _attempt_proof_iff( + self, current, context, agenda, accessible_vars, atoms, debug + ): + new_agenda = agenda.clone() + agenda.put(current.first, context) + agenda.put(current.second, context) + new_agenda.put(-current.first, context) + new_agenda.put(-current.second, context) + return self._attempt_proof( + agenda, accessible_vars, atoms, debug + 1 + ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1) + + def _attempt_proof_n_iff( + self, current, context, agenda, accessible_vars, atoms, debug + ): + new_agenda = agenda.clone() + agenda.put(current.term.first, context) + agenda.put(-current.term.second, context) + new_agenda.put(-current.term.first, context) + new_agenda.put(current.term.second, context) + return self._attempt_proof( + agenda, accessible_vars, atoms, debug + 1 + ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1) + + def _attempt_proof_eq( + self, current, context, agenda, accessible_vars, atoms, debug + ): + ######################################################################### + # Since 'current' is 
of the form '(a = b)', replace ALL free instances + # of 'a' with 'b' + ######################################################################### + agenda.put_atoms(atoms) + agenda.replace_all(current.first, current.second) + accessible_vars.discard(current.first) + agenda.mark_neqs_fresh() + return self._attempt_proof(agenda, accessible_vars, set(), debug + 1) + + def _attempt_proof_some( + self, current, context, agenda, accessible_vars, atoms, debug + ): + new_unique_variable = VariableExpression(unique_variable()) + agenda.put(current.term.replace(current.variable, new_unique_variable), context) + agenda.mark_alls_fresh() + return self._attempt_proof( + agenda, accessible_vars | {new_unique_variable}, atoms, debug + 1 + ) + + def _attempt_proof_all( + self, current, context, agenda, accessible_vars, atoms, debug + ): + try: + current._used_vars + except AttributeError: + current._used_vars = set() + + # if there are accessible_vars on the path + if accessible_vars: + # get the set of bound variables that have not be used by this AllExpression + bv_available = accessible_vars - current._used_vars + + if bv_available: + variable_to_use = list(bv_available)[0] + debug.line("--> Using '%s'" % variable_to_use, 2) + current._used_vars |= {variable_to_use} + agenda.put( + current.term.replace(current.variable, variable_to_use), context + ) + agenda[Categories.ALL].add((current, context)) + return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) + + else: + # no more available variables to substitute + debug.line("--> Variables Exhausted", 2) + current._exhausted = True + agenda[Categories.ALL].add((current, context)) + return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) + + else: + new_unique_variable = VariableExpression(unique_variable()) + debug.line("--> Using '%s'" % new_unique_variable, 2) + current._used_vars |= {new_unique_variable} + agenda.put( + current.term.replace(current.variable, new_unique_variable), context + ) + agenda[Categories.ALL].add((current, context)) + agenda.mark_alls_fresh() + return self._attempt_proof( + agenda, accessible_vars | {new_unique_variable}, atoms, debug + 1 + ) + + @staticmethod + def is_atom(e): + if isinstance(e, NegatedExpression): + e = e.term + + if isinstance(e, ApplicationExpression): + for arg in e.args: + if not TableauProver.is_atom(arg): + return False + return True + elif isinstance(e, AbstractVariableExpression) or isinstance( + e, LambdaExpression + ): + return True + else: + return False + + +class TableauProverCommand(BaseProverCommand): + def __init__(self, goal=None, assumptions=None, prover=None): + """ + :param goal: Input expression to prove + :type goal: sem.Expression + :param assumptions: Input expressions to use as assumptions in + the proof. 
+ :type assumptions: list(sem.Expression) + """ + if prover is not None: + assert isinstance(prover, TableauProver) + else: + prover = TableauProver() + + BaseProverCommand.__init__(self, prover, goal, assumptions) + + +class Agenda: + def __init__(self): + self.sets = tuple(set() for i in range(21)) + + def clone(self): + new_agenda = Agenda() + set_list = [s.copy() for s in self.sets] + + new_allExs = set() + for allEx, _ in set_list[Categories.ALL]: + new_allEx = AllExpression(allEx.variable, allEx.term) + try: + new_allEx._used_vars = {used for used in allEx._used_vars} + except AttributeError: + new_allEx._used_vars = set() + new_allExs.add((new_allEx, None)) + set_list[Categories.ALL] = new_allExs + + set_list[Categories.N_EQ] = { + (NegatedExpression(n_eq.term), ctx) + for (n_eq, ctx) in set_list[Categories.N_EQ] + } + + new_agenda.sets = tuple(set_list) + return new_agenda + + def __getitem__(self, index): + return self.sets[index] + + def put(self, expression, context=None): + if isinstance(expression, AllExpression): + ex_to_add = AllExpression(expression.variable, expression.term) + try: + ex_to_add._used_vars = {used for used in expression._used_vars} + except AttributeError: + ex_to_add._used_vars = set() + else: + ex_to_add = expression + self.sets[self._categorize_expression(ex_to_add)].add((ex_to_add, context)) + + def put_all(self, expressions): + for expression in expressions: + self.put(expression) + + def put_atoms(self, atoms): + for atom, neg in atoms: + if neg: + self[Categories.N_ATOM].add((-atom, None)) + else: + self[Categories.ATOM].add((atom, None)) + + def pop_first(self): + """Pop the first expression that appears in the agenda""" + for i, s in enumerate(self.sets): + if s: + if i in [Categories.N_EQ, Categories.ALL]: + for ex in s: + try: + if not ex[0]._exhausted: + s.remove(ex) + return (ex, i) + except AttributeError: + s.remove(ex) + return (ex, i) + else: + return (s.pop(), i) + return ((None, None), None) + + def replace_all(self, old, new): + for s in self.sets: + for ex, ctx in s: + ex.replace(old.variable, new) + if ctx is not None: + ctx.replace(old.variable, new) + + def mark_alls_fresh(self): + for u, _ in self.sets[Categories.ALL]: + u._exhausted = False + + def mark_neqs_fresh(self): + for neq, _ in self.sets[Categories.N_EQ]: + neq._exhausted = False + + def _categorize_expression(self, current): + if isinstance(current, NegatedExpression): + return self._categorize_NegatedExpression(current) + elif isinstance(current, FunctionVariableExpression): + return Categories.PROP + elif TableauProver.is_atom(current): + return Categories.ATOM + elif isinstance(current, AllExpression): + return Categories.ALL + elif isinstance(current, AndExpression): + return Categories.AND + elif isinstance(current, OrExpression): + return Categories.OR + elif isinstance(current, ImpExpression): + return Categories.IMP + elif isinstance(current, IffExpression): + return Categories.IFF + elif isinstance(current, EqualityExpression): + return Categories.EQ + elif isinstance(current, ExistsExpression): + return Categories.EXISTS + elif isinstance(current, ApplicationExpression): + return Categories.APP + else: + raise ProverParseError("cannot categorize %s" % current.__class__.__name__) + + def _categorize_NegatedExpression(self, current): + negated = current.term + + if isinstance(negated, NegatedExpression): + return Categories.D_NEG + elif isinstance(negated, FunctionVariableExpression): + return Categories.N_PROP + elif TableauProver.is_atom(negated): + return 
Categories.N_ATOM + elif isinstance(negated, AllExpression): + return Categories.N_ALL + elif isinstance(negated, AndExpression): + return Categories.N_AND + elif isinstance(negated, OrExpression): + return Categories.N_OR + elif isinstance(negated, ImpExpression): + return Categories.N_IMP + elif isinstance(negated, IffExpression): + return Categories.N_IFF + elif isinstance(negated, EqualityExpression): + return Categories.N_EQ + elif isinstance(negated, ExistsExpression): + return Categories.N_EXISTS + elif isinstance(negated, ApplicationExpression): + return Categories.N_APP + else: + raise ProverParseError("cannot categorize %s" % negated.__class__.__name__) + + +class Debug: + def __init__(self, verbose, indent=0, lines=None): + self.verbose = verbose + self.indent = indent + + if not lines: + lines = [] + self.lines = lines + + def __add__(self, increment): + return Debug(self.verbose, self.indent + 1, self.lines) + + def line(self, data, indent=0): + if isinstance(data, tuple): + ex, ctx = data + if ctx: + data = f"{ex}, {ctx}" + else: + data = "%s" % ex + + if isinstance(ex, AllExpression): + try: + used_vars = "[%s]" % ( + ",".join("%s" % ve.variable.name for ve in ex._used_vars) + ) + data += ": %s" % used_vars + except AttributeError: + data += ": []" + + newline = "{}{}".format(" " * (self.indent + indent), data) + self.lines.append(newline) + + if self.verbose: + print(newline) + + +class Categories: + ATOM = 0 + PROP = 1 + N_ATOM = 2 + N_PROP = 3 + APP = 4 + N_APP = 5 + N_EQ = 6 + D_NEG = 7 + N_ALL = 8 + N_EXISTS = 9 + AND = 10 + N_OR = 11 + N_IMP = 12 + OR = 13 + IMP = 14 + N_AND = 15 + IFF = 16 + N_IFF = 17 + EQ = 18 + EXISTS = 19 + ALL = 20 + + +def testTableauProver(): + tableau_test("P | -P") + tableau_test("P & -P") + tableau_test("Q", ["P", "(P -> Q)"]) + tableau_test("man(x)") + tableau_test("(man(x) -> man(x))") + tableau_test("(man(x) -> --man(x))") + tableau_test("-(man(x) and -man(x))") + tableau_test("(man(x) or -man(x))") + tableau_test("(man(x) -> man(x))") + tableau_test("-(man(x) and -man(x))") + tableau_test("(man(x) or -man(x))") + tableau_test("(man(x) -> man(x))") + tableau_test("(man(x) iff man(x))") + tableau_test("-(man(x) iff -man(x))") + tableau_test("all x.man(x)") + tableau_test("all x.all y.((x = y) -> (y = x))") + tableau_test("all x.all y.all z.(((x = y) & (y = z)) -> (x = z))") + # tableau_test('-all x.some y.F(x,y) & some x.all y.(-F(x,y))') + # tableau_test('some x.all y.sees(x,y)') + + p1 = "all x.(man(x) -> mortal(x))" + p2 = "man(Socrates)" + c = "mortal(Socrates)" + tableau_test(c, [p1, p2]) + + p1 = "all x.(man(x) -> walks(x))" + p2 = "man(John)" + c = "some y.walks(y)" + tableau_test(c, [p1, p2]) + + p = "((x = y) & walks(y))" + c = "walks(x)" + tableau_test(c, [p]) + + p = "((x = y) & ((y = z) & (z = w)))" + c = "(x = w)" + tableau_test(c, [p]) + + p = "some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))" + c = "some e0.walk(e0,mary)" + tableau_test(c, [p]) + + c = "(exists x.exists z3.((x = Mary) & ((z3 = John) & sees(z3,x))) <-> exists x.exists z4.((x = John) & ((z4 = Mary) & sees(x,z4))))" + tableau_test(c) + + +# p = 'some e1.some e2.((believe e1 john e2) and (walk e2 mary))' +# c = 'some x.some e3.some e4.((believe e3 x e4) and (walk e4 mary))' +# tableau_test(c, [p]) + + +def testHigherOrderTableauProver(): + tableau_test("believe(j, -lie(b))", ["believe(j, -lie(b) & -cheat(b))"]) + tableau_test("believe(j, lie(b) & cheat(b))", ["believe(j, lie(b))"]) + tableau_test( + "believe(j, lie(b))", ["lie(b)"] + ) # how do we capture 
that John believes all things that are true
+    tableau_test(
+        "believe(j, know(b, cheat(b)))",
+        ["believe(j, know(b, lie(b)) & know(b, steals(b) & cheat(b)))"],
+    )
+    tableau_test("P(Q(y), R(y) & R(z))", ["P(Q(x) & Q(y), R(y) & R(z))"])
+
+    tableau_test("believe(j, cheat(b) & lie(b))", ["believe(j, lie(b) & cheat(b))"])
+    tableau_test("believe(j, -cheat(b) & -lie(b))", ["believe(j, -lie(b) & -cheat(b))"])
+
+
+def tableau_test(c, ps=None, verbose=False):
+    pc = Expression.fromstring(c)
+    pps = [Expression.fromstring(p) for p in ps] if ps else []
+    if not ps:
+        ps = []
+    print(
+        "%s |- %s: %s"
+        % (", ".join(ps), pc, TableauProver().prove(pc, pps, verbose=verbose))
+    )
+
+
+def demo():
+    testTableauProver()
+    testHigherOrderTableauProver()
+
+
+if __name__ == "__main__":
+    demo()
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/jsontags.py b/.eggs/nltk-3.8-py3.10.egg/nltk/jsontags.py
new file mode 100644
index 0000000000000000000000000000000000000000..42ab3fb4130a4096eb28a7a7f117347928802156
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/jsontags.py
@@ -0,0 +1,65 @@
+# Natural Language Toolkit: JSON Encoder/Decoder Helpers
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Steven Xu
+#
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+
+"""
+Register JSON tags, so the nltk data loader knows what module and class to look for.
+
+NLTK uses simple '!' tags to mark the types of objects, but the fully-qualified
+"tag:nltk.org,2011:" prefix is also accepted in case anyone ends up
+using it.
+"""
+
+import json
+
+json_tags = {}
+
+TAG_PREFIX = "!"
+
+
+def register_tag(cls):
+    """
+    Decorates a class to register its JSON tag.
+    """
+    json_tags[TAG_PREFIX + getattr(cls, "json_tag")] = cls
+    return cls
+
+
+class JSONTaggedEncoder(json.JSONEncoder):
+    def default(self, obj):
+        obj_tag = getattr(obj, "json_tag", None)
+        if obj_tag is None:
+            return super().default(obj)
+        obj_tag = TAG_PREFIX + obj_tag
+        obj = obj.encode_json_obj()
+        return {obj_tag: obj}
+
+
+class JSONTaggedDecoder(json.JSONDecoder):
+    def decode(self, s):
+        return self.decode_obj(super().decode(s))
+
+    @classmethod
+    def decode_obj(cls, obj):
+        # Decode nested objects first.
+        if isinstance(obj, dict):
+            obj = {key: cls.decode_obj(val) for (key, val) in obj.items()}
+        elif isinstance(obj, list):
+            obj = list(cls.decode_obj(val) for val in obj)
+        # Check if we have a tagged object.
+        if not isinstance(obj, dict) or len(obj) != 1:
+            return obj
+        obj_tag = next(iter(obj.keys()))
+        if not obj_tag.startswith("!"):
+            return obj
+        if obj_tag not in json_tags:
+            raise ValueError("Unknown tag", obj_tag)
+        obj_cls = json_tags[obj_tag]
+        return obj_cls.decode_json_obj(obj[obj_tag])
+
+
+__all__ = ["register_tag", "json_tags", "JSONTaggedEncoder", "JSONTaggedDecoder"]
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/lm/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/lm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4eae4923c109b3ad6870dd2a98a44976532d5c4c
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/lm/__init__.py
@@ -0,0 +1,235 @@
+# Natural Language Toolkit: Language Models
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Authors: Ilia Kurenkov
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+"""
+NLTK Language Modeling Module.
+------------------------------
+
+Currently this module covers only ngram language models, but it should be easy
+to extend to neural models.
+
+
+Preparing Data
+==============
+
+Before we train our ngram models it is necessary to make sure the data we put in
+them is in the right format.
+Let's say we have a text that is a list of sentences, where each sentence is
+a list of strings. For simplicity we just consider a text consisting of
+characters instead of words.
+
+    >>> text = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]
+
+If we want to train a bigram model, we need to turn this text into bigrams.
+Here's what the first sentence of our text would look like if we use a function
+from NLTK for this.
+
+    >>> from nltk.util import bigrams
+    >>> list(bigrams(text[0]))
+    [('a', 'b'), ('b', 'c')]
+
+Notice how "b" occurs both as the first and second member of different bigrams
+but "a" and "c" don't? Wouldn't it be nice to somehow indicate how often sentences
+start with "a" and end with "c"?
+A standard way to deal with this is to add special "padding" symbols to the
+sentence before splitting it into ngrams.
+Fortunately, NLTK also has a function for that, let's see what it does to the
+first sentence.
+
+    >>> from nltk.util import pad_sequence
+    >>> list(pad_sequence(text[0],
+    ... pad_left=True,
+    ... left_pad_symbol="<s>",
+    ... pad_right=True,
+    ... right_pad_symbol="</s>",
+    ... n=2))
+    ['<s>', 'a', 'b', 'c', '</s>']
+
+Note the `n` argument, which tells the function we need padding for bigrams.
+Now, passing all these parameters every time is tedious and in most cases they
+can be safely assumed as defaults anyway.
+Thus our module provides a convenience function that has all these arguments
+already set while the other arguments remain the same as for `pad_sequence`.
+
+    >>> from nltk.lm.preprocessing import pad_both_ends
+    >>> list(pad_both_ends(text[0], n=2))
+    ['<s>', 'a', 'b', 'c', '</s>']
+
+Combining the two parts discussed so far we get the following preparation steps
+for one sentence.
+
+    >>> list(bigrams(pad_both_ends(text[0], n=2)))
+    [('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]
+
+To make our model more robust we could also train it on unigrams (single words)
+as well as bigrams, its main source of information.
+NLTK once again helpfully provides a function called `everygrams`.
+While not the most efficient, it is conceptually simple.
+
+
+    >>> from nltk.util import everygrams
+    >>> padded_bigrams = list(pad_both_ends(text[0], n=2))
+    >>> list(everygrams(padded_bigrams, max_len=2))
+    [('<s>',), ('<s>', 'a'), ('a',), ('a', 'b'), ('b',), ('b', 'c'), ('c',), ('c', '</s>'), ('</s>',)]
+
+We are almost ready to start counting ngrams, just one more step left.
+During training and evaluation our model will rely on a vocabulary that
+defines which words are "known" to the model.
+To create this vocabulary we need to pad our sentences (just like for counting
+ngrams) and then combine the sentences into one flat stream of words.
+
+    >>> from nltk.lm.preprocessing import flatten
+    >>> list(flatten(pad_both_ends(sent, n=2) for sent in text))
+    ['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']
+
+In most cases we want to use the same text as the source for both vocabulary
+and ngram counts.
+Now that we understand what this means for our preprocessing, we can simply import
+a function that does everything for us.
+
+    >>> from nltk.lm.preprocessing import padded_everygram_pipeline
+    >>> train, vocab = padded_everygram_pipeline(2, text)
+
+So as to avoid re-creating the text in memory, both `train` and `vocab` are lazy
+iterators. They are evaluated on demand at training time.
+
+
+Training
+========
+Having prepared our data we are ready to start training a model.
+As a simple example, let us train a Maximum Likelihood Estimator (MLE).
+We only need to specify the highest ngram order to instantiate it.
+
+    >>> from nltk.lm import MLE
+    >>> lm = MLE(2)
+
+This automatically creates an empty vocabulary...
+
+    >>> len(lm.vocab)
+    0
+
+... which gets filled as we fit the model.
+
+    >>> lm.fit(train, vocab)
+    >>> print(lm.vocab)
+    <Vocabulary with cutoff=1 unk_label='<UNK>' and 9 items>
+    >>> len(lm.vocab)
+    9
+
+The vocabulary helps us handle words that have not occurred during training.
+
+    >>> lm.vocab.lookup(text[0])
+    ('a', 'b', 'c')
+    >>> lm.vocab.lookup(["aliens", "from", "Mars"])
+    ('<UNK>', '<UNK>', '<UNK>')
+
+Moreover, in some cases we want to ignore words that we did see during training
+but that didn't occur frequently enough to provide us useful information.
+You can tell the vocabulary to ignore such words.
+To find out how that works, check out the docs for the `Vocabulary` class.
+
+
+Using a Trained Model
+=====================
+When it comes to ngram models the training boils down to counting up the ngrams
+from the training corpus.
+
+    >>> print(lm.counts)
+    <NgramCounter with 2 ngram orders and 24 ngrams>
+
+This provides a convenient interface to access counts for unigrams...
+
+    >>> lm.counts['a']
+    2
+
+...and bigrams (in this case "a b")
+
+    >>> lm.counts[['a']]['b']
+    1
+
+And so on. However, the real purpose of training a language model is to have it
+score how probable words are in certain contexts.
+This being MLE, the model returns the item's relative frequency as its score.
+
+    >>> lm.score("a")
+    0.15384615384615385
+
+Items that are not seen during training are mapped to the vocabulary's
+"unknown label" token. This is "<UNK>" by default.
+
+    >>> lm.score("<UNK>") == lm.score("aliens")
+    True
+
+Here's how you get the score for a word given some preceding context.
+For example we want to know what the chance is that "b" follows "a".
+
+    >>> lm.score("b", ["a"])
+    0.5
+
+To avoid underflow when working with many small score values it makes sense to
+take their logarithm.
+For convenience this can be done with the `logscore` method.
+
+    >>> lm.logscore("a")
+    -2.700439718141092
+
+Building on this method, we can also evaluate our model's cross-entropy and
+perplexity with respect to sequences of ngrams.
+
+    >>> test = [('a', 'b'), ('c', 'd')]
+    >>> lm.entropy(test)
+    1.292481250360578
+    >>> lm.perplexity(test)
+    2.449489742783178
+
+It is advisable to preprocess your test text exactly the same way as you did
+the training text.
+
+One cool feature of ngram models is that they can be used to generate text.
+
+    >>> lm.generate(1, random_seed=3)
+    '<s>'
+    >>> lm.generate(5, random_seed=3)
+    ['<s>', 'a', 'b', 'c', 'd']
+
+Provide `random_seed` if you want to consistently reproduce the same text all
+other things being equal. Here we are using it to test the examples.
+
+You can also condition your generation on some preceding text with the
+`text_seed` argument.
+
+    >>> lm.generate(5, text_seed=['c'], random_seed=3)
+    ['</s>', 'c', 'd', 'c', 'd']
+
+Note that an ngram model is restricted in how much preceding context it can
+take into account. For example, a trigram model can only condition its output
+on 2 preceding words. If you pass in a 4-word context, the first two words
+will be ignored.
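+
+As an illustrative sketch (not executed here, since the output depends on the
+fitted model): with our bigram `lm`, seeding generation with ['b', 'c']
+conditions only on the final word 'c'.
+
+    >>> lm.generate(2, text_seed=['b', 'c'], random_seed=3) # doctest: +SKIP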
+"""
+
+from nltk.lm.counter import NgramCounter
+from nltk.lm.models import (
+    MLE,
+    AbsoluteDiscountingInterpolated,
+    KneserNeyInterpolated,
+    Laplace,
+    Lidstone,
+    StupidBackoff,
+    WittenBellInterpolated,
+)
+from nltk.lm.vocabulary import Vocabulary
+
+__all__ = [
+    "Vocabulary",
+    "NgramCounter",
+    "MLE",
+    "Lidstone",
+    "Laplace",
+    "WittenBellInterpolated",
+    "KneserNeyInterpolated",
+    "AbsoluteDiscountingInterpolated",
+    "StupidBackoff",
+]
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/lm/counter.py b/.eggs/nltk-3.8-py3.10.egg/nltk/lm/counter.py
new file mode 100644
index 0000000000000000000000000000000000000000..305513ac57edcfe36de134342037e5e2caf740ee
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/lm/counter.py
@@ -0,0 +1,163 @@
+# Natural Language Toolkit
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Ilia Kurenkov
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+"""
+Language Model Counter
+----------------------
+"""
+
+from collections import defaultdict
+from collections.abc import Sequence
+
+from nltk.probability import ConditionalFreqDist, FreqDist
+
+
+class NgramCounter:
+    """Class for counting ngrams.
+
+    Will count any ngram sequence you give it ;)
+
+    First we need to make sure we are feeding the counter sentences of ngrams.
+
+    >>> text = [["a", "b", "c", "d"], ["a", "c", "d", "c"]]
+    >>> from nltk.util import ngrams
+    >>> text_bigrams = [ngrams(sent, 2) for sent in text]
+    >>> text_unigrams = [ngrams(sent, 1) for sent in text]
+
+    The counting itself is very simple.
+
+    >>> from nltk.lm import NgramCounter
+    >>> ngram_counts = NgramCounter(text_bigrams + text_unigrams)
+
+    You can conveniently access ngram counts using standard python dictionary notation.
+    String keys will give you unigram counts.
+
+    >>> ngram_counts['a']
+    2
+    >>> ngram_counts['aliens']
+    0
+
+    If you want to access counts for higher order ngrams, use a list or a tuple.
+    These are treated as "context" keys, so what you get is a frequency distribution
+    over all continuations after the given context.
+
+    >>> sorted(ngram_counts[['a']].items())
+    [('b', 1), ('c', 1)]
+    >>> sorted(ngram_counts[('a',)].items())
+    [('b', 1), ('c', 1)]
+
+    This is equivalent to specifying explicitly the order of the ngram (in this case
+    2 for bigram) and indexing on the context.
+
+    >>> ngram_counts[2][('a',)] is ngram_counts[['a']]
+    True
+
+    Note that the keys in `ConditionalFreqDist` cannot be lists, only tuples!
+    It is generally advisable to use the less verbose and more flexible square
+    bracket notation.
+
+    To get the count of the full ngram "a b", do this:
+
+    >>> ngram_counts[['a']]['b']
+    1
+
+    Specifying the ngram order as a number can be useful for accessing all ngrams
+    in that order.
+
+    >>> ngram_counts[2]
+    <ConditionalFreqDist with 4 conditions>
+
+    The keys of this `ConditionalFreqDist` are the contexts we discussed earlier.
+    Unigrams can also be accessed with a human-friendly alias.
+
+    >>> ngram_counts.unigrams is ngram_counts[1]
+    True
+
+    Similarly to `collections.Counter`, you can update counts after initialization.
+
+    >>> ngram_counts['e']
+    0
+    >>> ngram_counts.update([ngrams(["d", "e", "f"], 1)])
+    >>> ngram_counts['e']
+    1
+
+    """
+
+    def __init__(self, ngram_text=None):
+        """Creates a new NgramCounter.
+
+        If `ngram_text` is specified, counts ngrams from it, otherwise waits for
+        `update` method to be called explicitly.
+
+        :param ngram_text: Optional text containing sentences of ngrams, as for `update` method.
+ :type ngram_text: Iterable(Iterable(tuple(str))) or None + + """ + self._counts = defaultdict(ConditionalFreqDist) + self._counts[1] = self.unigrams = FreqDist() + + if ngram_text: + self.update(ngram_text) + + def update(self, ngram_text): + """Updates ngram counts from `ngram_text`. + + Expects `ngram_text` to be a sequence of sentences (sequences). + Each sentence consists of ngrams as tuples of strings. + + :param Iterable(Iterable(tuple(str))) ngram_text: Text containing sentences of ngrams. + :raises TypeError: if the ngrams are not tuples. + + """ + + for sent in ngram_text: + for ngram in sent: + if not isinstance(ngram, tuple): + raise TypeError( + "Ngram <{}> isn't a tuple, " "but {}".format(ngram, type(ngram)) + ) + + ngram_order = len(ngram) + if ngram_order == 1: + self.unigrams[ngram[0]] += 1 + continue + + context, word = ngram[:-1], ngram[-1] + self[ngram_order][context][word] += 1 + + def N(self): + """Returns grand total number of ngrams stored. + + This includes ngrams from all orders, so some duplication is expected. + :rtype: int + + >>> from nltk.lm import NgramCounter + >>> counts = NgramCounter([[("a", "b"), ("c",), ("d", "e")]]) + >>> counts.N() + 3 + + """ + return sum(val.N() for val in self._counts.values()) + + def __getitem__(self, item): + """User-friendly access to ngram counts.""" + if isinstance(item, int): + return self._counts[item] + elif isinstance(item, str): + return self._counts.__getitem__(1)[item] + elif isinstance(item, Sequence): + return self._counts.__getitem__(len(item) + 1)[tuple(item)] + + def __str__(self): + return "<{} with {} ngram orders and {} ngrams>".format( + self.__class__.__name__, len(self._counts), self.N() + ) + + def __len__(self): + return self._counts.__len__() + + def __contains__(self, item): + return item in self._counts diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/lm/models.py b/.eggs/nltk-3.8-py3.10.egg/nltk/lm/models.py new file mode 100644 index 0000000000000000000000000000000000000000..62e34ea6de3799b2e665e210fc5fe1c101b2e860 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/lm/models.py @@ -0,0 +1,141 @@ +# Natural Language Toolkit: Language Models +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Ilia Kurenkov +# Manu Joseph +# URL: +# For license information, see LICENSE.TXT +"""Language Models""" + +from nltk.lm.api import LanguageModel, Smoothing +from nltk.lm.smoothing import AbsoluteDiscounting, KneserNey, WittenBell + + +class MLE(LanguageModel): + """Class for providing MLE ngram model scores. + + Inherits initialization from BaseNgramModel. + """ + + def unmasked_score(self, word, context=None): + """Returns the MLE score for a word given a context. + + Args: + - word is expected to be a string + - context is expected to be something reasonably convertible to a tuple + """ + return self.context_counts(context).freq(word) + + +class Lidstone(LanguageModel): + """Provides Lidstone-smoothed scores. + + In addition to initialization arguments from BaseNgramModel also requires + a number by which to increase the counts, gamma. + """ + + def __init__(self, gamma, *args, **kwargs): + super().__init__(*args, **kwargs) + self.gamma = gamma + + def unmasked_score(self, word, context=None): + """Add-one smoothing: Lidstone or Laplace. + + To see what kind, look at `gamma` attribute on the class. 
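+
+        As an illustrative sketch (hypothetical numbers, not drawn from any
+        example above): with gamma = 0.2, a word counted once in a context
+        with two events total, and a vocabulary of 9 items, the score is
+        (1 + 0.2) / (2 + 9 * 0.2) = 1.2 / 3.8, roughly 0.316.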
+ + """ + counts = self.context_counts(context) + word_count = counts[word] + norm_count = counts.N() + return (word_count + self.gamma) / (norm_count + len(self.vocab) * self.gamma) + + +class Laplace(Lidstone): + """Implements Laplace (add one) smoothing. + + Initialization identical to BaseNgramModel because gamma is always 1. + """ + + def __init__(self, *args, **kwargs): + super().__init__(1, *args, **kwargs) + + +class StupidBackoff(LanguageModel): + """Provides StupidBackoff scores. + + In addition to initialization arguments from BaseNgramModel also requires + a parameter alpha with which we scale the lower order probabilities. + Note that this is not a true probability distribution as scores for ngrams + of the same order do not sum up to unity. + """ + + def __init__(self, alpha=0.4, *args, **kwargs): + super().__init__(*args, **kwargs) + self.alpha = alpha + + def unmasked_score(self, word, context=None): + if not context: + # Base recursion + return self.counts.unigrams.freq(word) + counts = self.context_counts(context) + word_count = counts[word] + norm_count = counts.N() + if word_count > 0: + return word_count / norm_count + else: + return self.alpha * self.unmasked_score(word, context[1:]) + + +class InterpolatedLanguageModel(LanguageModel): + """Logic common to all interpolated language models. + + The idea to abstract this comes from Chen & Goodman 1995. + Do not instantiate this class directly! + """ + + def __init__(self, smoothing_cls, order, **kwargs): + params = kwargs.pop("params", {}) + super().__init__(order, **kwargs) + self.estimator = smoothing_cls(self.vocab, self.counts, **params) + + def unmasked_score(self, word, context=None): + if not context: + # The base recursion case: no context, we only have a unigram. + return self.estimator.unigram_score(word) + if not self.counts[context]: + # It can also happen that we have no data for this context. + # In that case we defer to the lower-order ngram. + # This is the same as setting alpha to 0 and gamma to 1. + alpha, gamma = 0, 1 + else: + alpha, gamma = self.estimator.alpha_gamma(word, context) + return alpha + gamma * self.unmasked_score(word, context[1:]) + + +class WittenBellInterpolated(InterpolatedLanguageModel): + """Interpolated version of Witten-Bell smoothing.""" + + def __init__(self, order, **kwargs): + super().__init__(WittenBell, order, **kwargs) + + +class AbsoluteDiscountingInterpolated(InterpolatedLanguageModel): + """Interpolated version of smoothing with absolute discount.""" + + def __init__(self, order, discount=0.75, **kwargs): + super().__init__( + AbsoluteDiscounting, order, params={"discount": discount}, **kwargs + ) + + +class KneserNeyInterpolated(InterpolatedLanguageModel): + """Interpolated version of Kneser-Ney smoothing.""" + + def __init__(self, order, discount=0.1, **kwargs): + if not (0 <= discount <= 1): + raise ValueError( + "Discount must be between 0 and 1 for probabilities to sum to unity." 
+            )
+        super().__init__(
+            KneserNey, order, params={"discount": discount, "order": order}, **kwargs
+        )
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/lm/preprocessing.py b/.eggs/nltk-3.8-py3.10.egg/nltk/lm/preprocessing.py
new file mode 100644
index 0000000000000000000000000000000000000000..f35051a3d86af8ee965a3d10c6da0e49535b5a6f
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/lm/preprocessing.py
@@ -0,0 +1,51 @@
+# Natural Language Toolkit: Language Model Unit Tests
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Ilia Kurenkov
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+from functools import partial
+from itertools import chain
+
+from nltk.util import everygrams, pad_sequence
+
+flatten = chain.from_iterable
+pad_both_ends = partial(
+    pad_sequence,
+    pad_left=True,
+    left_pad_symbol="<s>",
+    pad_right=True,
+    right_pad_symbol="</s>",
+)
+pad_both_ends.__doc__ = """Pads both ends of a sentence to length specified by ngram order.
+
+    Following convention "<s>" pads the start of sentence, "</s>" pads its end.
+    """
+
+
+def padded_everygrams(order, sentence):
+    """Helper with some useful defaults.
+
+    Applies pad_both_ends to sentence and follows it up with everygrams.
+    """
+    return everygrams(list(pad_both_ends(sentence, n=order)), max_len=order)
+
+
+def padded_everygram_pipeline(order, text):
+    """Default preprocessing for a sequence of sentences.
+
+    Creates two iterators:
+
+    - sentences padded and turned into sequences of `nltk.util.everygrams`
+    - sentences padded as above and chained together for a flat stream of words
+
+    :param order: Largest ngram length produced by `everygrams`.
+    :param text: Text to iterate over. Expected to be an iterable of sentences.
+    :type text: Iterable[Iterable[str]]
+    :return: iterator over text as ngrams, iterator over text as vocabulary data
+    """
+    padding_fn = partial(pad_both_ends, n=order)
+    return (
+        (everygrams(list(padding_fn(sent)), max_len=order) for sent in text),
+        flatten(map(padding_fn, text)),
+    )
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/lm/smoothing.py b/.eggs/nltk-3.8-py3.10.egg/nltk/lm/smoothing.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9c4cdb3d63e9b1ade5e25ae5a360fc3e524b337
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/lm/smoothing.py
@@ -0,0 +1,127 @@
+# Natural Language Toolkit: Language Model Unit Tests
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Ilia Kurenkov
+#         Manu Joseph
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+"""Smoothing algorithms for language modeling.
+
+According to Chen & Goodman 1995 these should work with both Backoff and
+Interpolation.
+"""
+from operator import methodcaller
+
+from nltk.lm.api import Smoothing
+from nltk.probability import ConditionalFreqDist
+
+
+def _count_values_gt_zero(distribution):
+    """Count values that are greater than zero in a distribution.
+
+    Assumes distribution is either a mapping with counts as values or
+    an instance of `nltk.ConditionalFreqDist`.
+    """
+    as_count = (
+        methodcaller("N")
+        if isinstance(distribution, ConditionalFreqDist)
+        else lambda count: count
+    )
+    # We explicitly check that values are > 0 to guard against negative counts.
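+    # For a ConditionalFreqDist the values are FreqDists, so as_count calls
+    # N() to get each condition's total; for a plain mapping the values are
+    # already plain counts.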
+ return sum( + 1 for dist_or_count in distribution.values() if as_count(dist_or_count) > 0 + ) + + +class WittenBell(Smoothing): + """Witten-Bell smoothing.""" + + def __init__(self, vocabulary, counter, **kwargs): + super().__init__(vocabulary, counter, **kwargs) + + def alpha_gamma(self, word, context): + alpha = self.counts[context].freq(word) + gamma = self._gamma(context) + return (1.0 - gamma) * alpha, gamma + + def _gamma(self, context): + n_plus = _count_values_gt_zero(self.counts[context]) + return n_plus / (n_plus + self.counts[context].N()) + + def unigram_score(self, word): + return self.counts.unigrams.freq(word) + + +class AbsoluteDiscounting(Smoothing): + """Smoothing with absolute discount.""" + + def __init__(self, vocabulary, counter, discount=0.75, **kwargs): + super().__init__(vocabulary, counter, **kwargs) + self.discount = discount + + def alpha_gamma(self, word, context): + alpha = ( + max(self.counts[context][word] - self.discount, 0) + / self.counts[context].N() + ) + gamma = self._gamma(context) + return alpha, gamma + + def _gamma(self, context): + n_plus = _count_values_gt_zero(self.counts[context]) + return (self.discount * n_plus) / self.counts[context].N() + + def unigram_score(self, word): + return self.counts.unigrams.freq(word) + + +class KneserNey(Smoothing): + """Kneser-Ney Smoothing. + + This is an extension of smoothing with a discount. + + Resources: + - https://pages.ucsd.edu/~rlevy/lign256/winter2008/kneser_ney_mini_example.pdf + - https://www.youtube.com/watch?v=ody1ysUTD7o + - https://medium.com/@dennyc/a-simple-numerical-example-for-kneser-ney-smoothing-nlp-4600addf38b8 + - https://www.cl.uni-heidelberg.de/courses/ss15/smt/scribe6.pdf + - https://www-i6.informatik.rwth-aachen.de/publications/download/951/Kneser-ICASSP-1995.pdf + """ + + def __init__(self, vocabulary, counter, order, discount=0.1, **kwargs): + super().__init__(vocabulary, counter, **kwargs) + self.discount = discount + self._order = order + + def unigram_score(self, word): + word_continuation_count, total_count = self._continuation_counts(word) + return word_continuation_count / total_count + + def alpha_gamma(self, word, context): + prefix_counts = self.counts[context] + word_continuation_count, total_count = ( + (prefix_counts[word], prefix_counts.N()) + if len(context) + 1 == self._order + else self._continuation_counts(word, context) + ) + alpha = max(word_continuation_count - self.discount, 0.0) / total_count + gamma = self.discount * _count_values_gt_zero(prefix_counts) / total_count + return alpha, gamma + + def _continuation_counts(self, word, context=tuple()): + """Count continuations that end with context and word. + + Continuations track unique ngram "types", regardless of how many + instances were observed for each "type". + This is different than raw ngram counts which track number of instances. 
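+
+        For example (an illustrative note): if the observed bigrams are
+        ('a', 'c'), ('b', 'c') and ('b', 'c') again, then 'c' has a
+        continuation count of 2 (two distinct one-word histories), not 3.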
+ """ + higher_order_ngrams_with_context = ( + counts + for prefix_ngram, counts in self.counts[len(context) + 2].items() + if prefix_ngram[1:] == context + ) + higher_order_ngrams_with_word_count, total = 0, 0 + for counts in higher_order_ngrams_with_context: + higher_order_ngrams_with_word_count += int(counts[word] > 0) + total += _count_values_gt_zero(counts) + return higher_order_ngrams_with_word_count, total diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a0f84a8e8e10a267b24c2de0a4af787bd36c79e9 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/__init__.py @@ -0,0 +1,51 @@ +# Natural Language Toolkit: Metrics +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# Edward Loper +# URL: +# For license information, see LICENSE.TXT +# + +""" +NLTK Metrics + +Classes and methods for scoring processing modules. +""" + +from nltk.metrics.agreement import AnnotationTask +from nltk.metrics.aline import align +from nltk.metrics.association import ( + BigramAssocMeasures, + ContingencyMeasures, + NgramAssocMeasures, + QuadgramAssocMeasures, + TrigramAssocMeasures, +) +from nltk.metrics.confusionmatrix import ConfusionMatrix +from nltk.metrics.distance import ( + binary_distance, + custom_distance, + edit_distance, + edit_distance_align, + fractional_presence, + interval_distance, + jaccard_distance, + masi_distance, + presence, +) +from nltk.metrics.paice import Paice +from nltk.metrics.scores import ( + accuracy, + approxrand, + f_measure, + log_likelihood, + precision, + recall, +) +from nltk.metrics.segmentation import ghd, pk, windowdiff +from nltk.metrics.spearman import ( + ranks_from_scores, + ranks_from_sequence, + spearman_correlation, +) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/agreement.py b/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/agreement.py new file mode 100644 index 0000000000000000000000000000000000000000..5f0afee1a64da98775ceac2fc7036c1c226fa6c2 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/agreement.py @@ -0,0 +1,465 @@ +# Natural Language Toolkit: Agreement Metrics +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Tom Lippincott +# URL: +# For license information, see LICENSE.TXT +# + +""" +Implementations of inter-annotator agreement coefficients surveyed by Artstein +and Poesio (2007), Inter-Coder Agreement for Computational Linguistics. + +An agreement coefficient calculates the amount that annotators agreed on label +assignments beyond what is expected by chance. + +In defining the AnnotationTask class, we use naming conventions similar to the +paper's terminology. There are three types of objects in an annotation task: + + the coders (variables "c" and "C") + the items to be annotated (variables "i" and "I") + the potential categories to be assigned (variables "k" and "K") + +Additionally, it is often the case that we don't want to treat two different +labels as complete disagreement, and so the AnnotationTask constructor can also +take a distance metric as a final argument. Distance metrics are simply +functions that take two arguments, and return a value between 0.0 and 1.0 +indicating the distance between them. If not supplied, the default is binary +comparison between the arguments. 
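+
+For example (an illustrative sketch, not part of the original module), a
+custom metric for part-of-speech labels might treat any two noun tags as
+half-way similar:
+
+    >>> def pos_distance(label1, label2):
+    ...     if label1 == label2:
+    ...         return 0.0
+    ...     if label1.startswith("NN") and label2.startswith("NN"):
+    ...         return 0.5
+    ...     return 1.0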
+
+The simplest way to initialize an AnnotationTask is with a list of triples,
+each containing a coder's assignment for one object in the task:
+
+    task = AnnotationTask(data=[('c1', '1', 'v1'),('c2', '1', 'v1'),...])
+
+Note that the data list needs to contain the same number of triples for each
+individual coder, containing category values for the same set of items.
+
+Alpha (Krippendorff 1980)
+Kappa (Cohen 1960)
+S (Bennett, Albert and Goldstein 1954)
+Pi (Scott 1955)
+
+
+TODO: Describe handling of multiple coders and missing data
+
+Expected results from the Artstein and Poesio survey paper:
+
+    >>> from nltk.metrics.agreement import AnnotationTask
+    >>> import os.path
+    >>> t = AnnotationTask(data=[x.split() for x in open(os.path.join(os.path.dirname(__file__), "artstein_poesio_example.txt"))])
+    >>> t.avg_Ao()
+    0.88
+    >>> round(t.pi(), 5)
+    0.79953
+    >>> round(t.S(), 2)
+    0.82
+
+    This would have returned a wrong value (0.0) in @785fb79 as coders are in
+    the wrong order. Subsequently, all values for pi(), S(), and kappa() would
+    have been wrong as they are computed with avg_Ao().
+    >>> t2 = AnnotationTask(data=[('b','1','stat'),('a','1','stat')])
+    >>> t2.avg_Ao()
+    1.0
+
+    The following, of course, also works.
+    >>> t3 = AnnotationTask(data=[('a','1','othr'),('b','1','othr')])
+    >>> t3.avg_Ao()
+    1.0
+
+"""
+
+import logging
+from itertools import groupby
+from operator import itemgetter
+
+from nltk.internals import deprecated
+from nltk.metrics.distance import binary_distance
+from nltk.probability import ConditionalFreqDist, FreqDist
+
+log = logging.getLogger(__name__)
+
+
+class AnnotationTask:
+    """Represents an annotation task, i.e. people assign labels to items.
+
+    Notation tries to match notation in Artstein and Poesio (2007).
+
+    In general, coders and items can be represented as any hashable object.
+    Integers, for example, are fine, though strings are more readable.
+    Labels must support the distance functions applied to them, so e.g.
+    a string-edit-distance makes no sense if your labels are integers,
+    whereas interval distance needs numeric values. A notable case of this
+    is the MASI metric, which requires Python sets.
+    """
+
+    def __init__(self, data=None, distance=binary_distance):
+        """Initialize an annotation task.
+
+        The data argument can be None (to create an empty annotation task) or a sequence of 3-tuples,
+        each representing a coder's labeling of an item:
+        ``(coder,item,label)``
+
+        The distance argument is a function taking two arguments (labels) and producing a numerical distance.
+        The distance from a label to itself should be zero:
+        ``distance(l,l) = 0``
+        """
+        self.distance = distance
+        self.I = set()
+        self.K = set()
+        self.C = set()
+        self.data = []
+        if data is not None:
+            self.load_array(data)
+
+    def __str__(self):
+        return "\r\n".join(
+            map(
+                lambda x: "%s\t%s\t%s"
+                % (x["coder"], x["item"].replace("_", "\t"), ",".join(x["labels"])),
+                self.data,
+            )
+        )
+
+    def load_array(self, array):
+        """Load a sequence of annotation results, appending to any data already loaded.
+ + The argument is a sequence of 3-tuples, each representing a coder's labeling of an item: + (coder,item,label) + """ + for coder, item, labels in array: + self.C.add(coder) + self.K.add(labels) + self.I.add(item) + self.data.append({"coder": coder, "labels": labels, "item": item}) + + def agr(self, cA, cB, i, data=None): + """Agreement between two coders on a given item""" + data = data or self.data + # cfedermann: we don't know what combination of coder/item will come + # first in x; to avoid StopIteration problems due to assuming an order + # cA,cB, we allow either for k1 and then look up the missing as k2. + k1 = next(x for x in data if x["coder"] in (cA, cB) and x["item"] == i) + if k1["coder"] == cA: + k2 = next(x for x in data if x["coder"] == cB and x["item"] == i) + else: + k2 = next(x for x in data if x["coder"] == cA and x["item"] == i) + + ret = 1.0 - float(self.distance(k1["labels"], k2["labels"])) + log.debug("Observed agreement between %s and %s on %s: %f", cA, cB, i, ret) + log.debug( + 'Distance between "%r" and "%r": %f', k1["labels"], k2["labels"], 1.0 - ret + ) + return ret + + def Nk(self, k): + return float(sum(1 for x in self.data if x["labels"] == k)) + + def Nik(self, i, k): + return float(sum(1 for x in self.data if x["item"] == i and x["labels"] == k)) + + def Nck(self, c, k): + return float(sum(1 for x in self.data if x["coder"] == c and x["labels"] == k)) + + @deprecated("Use Nk, Nik or Nck instead") + def N(self, k=None, i=None, c=None): + """Implements the "n-notation" used in Artstein and Poesio (2007)""" + if k is not None and i is None and c is None: + ret = self.Nk(k) + elif k is not None and i is not None and c is None: + ret = self.Nik(i, k) + elif k is not None and c is not None and i is None: + ret = self.Nck(c, k) + else: + raise ValueError( + f"You must pass either i or c, not both! 
(k={k!r},i={i!r},c={c!r})" + ) + log.debug("Count on N[%s,%s,%s]: %d", k, i, c, ret) + return ret + + def _grouped_data(self, field, data=None): + data = data or self.data + return groupby(sorted(data, key=itemgetter(field)), itemgetter(field)) + + def Ao(self, cA, cB): + """Observed agreement between two coders on all items.""" + data = self._grouped_data( + "item", (x for x in self.data if x["coder"] in (cA, cB)) + ) + ret = sum(self.agr(cA, cB, item, item_data) for item, item_data in data) / len( + self.I + ) + log.debug("Observed agreement between %s and %s: %f", cA, cB, ret) + return ret + + def _pairwise_average(self, function): + """ + Calculates the average of function results for each coder pair + """ + total = 0 + n = 0 + s = self.C.copy() + for cA in self.C: + s.remove(cA) + for cB in s: + total += function(cA, cB) + n += 1 + ret = total / n + return ret + + def avg_Ao(self): + """Average observed agreement across all coders and items.""" + ret = self._pairwise_average(self.Ao) + log.debug("Average observed agreement: %f", ret) + return ret + + def Do_Kw_pairwise(self, cA, cB, max_distance=1.0): + """The observed disagreement for the weighted kappa coefficient.""" + total = 0.0 + data = (x for x in self.data if x["coder"] in (cA, cB)) + for i, itemdata in self._grouped_data("item", data): + # we should have two items; distance doesn't care which comes first + total += self.distance(next(itemdata)["labels"], next(itemdata)["labels"]) + + ret = total / (len(self.I) * max_distance) + log.debug("Observed disagreement between %s and %s: %f", cA, cB, ret) + return ret + + def Do_Kw(self, max_distance=1.0): + """Averaged over all labelers""" + ret = self._pairwise_average( + lambda cA, cB: self.Do_Kw_pairwise(cA, cB, max_distance) + ) + log.debug("Observed disagreement: %f", ret) + return ret + + # Agreement Coefficients + def S(self): + """Bennett, Albert and Goldstein 1954""" + Ae = 1.0 / len(self.K) + ret = (self.avg_Ao() - Ae) / (1.0 - Ae) + return ret + + def pi(self): + """Scott 1955; here, multi-pi. + Equivalent to K from Siegel and Castellan (1988). + + """ + total = 0.0 + label_freqs = FreqDist(x["labels"] for x in self.data) + for k, f in label_freqs.items(): + total += f**2 + Ae = total / ((len(self.I) * len(self.C)) ** 2) + return (self.avg_Ao() - Ae) / (1 - Ae) + + def Ae_kappa(self, cA, cB): + Ae = 0.0 + nitems = float(len(self.I)) + label_freqs = ConditionalFreqDist((x["labels"], x["coder"]) for x in self.data) + for k in label_freqs.conditions(): + Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems) + return Ae + + def kappa_pairwise(self, cA, cB): + """ """ + Ae = self.Ae_kappa(cA, cB) + ret = (self.Ao(cA, cB) - Ae) / (1.0 - Ae) + log.debug("Expected agreement between %s and %s: %f", cA, cB, Ae) + return ret + + def kappa(self): + """Cohen 1960 + Averages naively over kappas for each coder pair. + + """ + return self._pairwise_average(self.kappa_pairwise) + + def multi_kappa(self): + """Davies and Fleiss 1982 + Averages over observed and expected agreements for each coder pair. 
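+        Note that this differs from ``kappa()``, which naively averages the
+        pairwise kappa values themselves; here the pairwise expected
+        agreements are averaged first and a single kappa is computed from
+        ``avg_Ao()``.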
+ + """ + Ae = self._pairwise_average(self.Ae_kappa) + return (self.avg_Ao() - Ae) / (1.0 - Ae) + + def Disagreement(self, label_freqs): + total_labels = sum(label_freqs.values()) + pairs = 0.0 + for j, nj in label_freqs.items(): + for l, nl in label_freqs.items(): + pairs += float(nj * nl) * self.distance(l, j) + return 1.0 * pairs / (total_labels * (total_labels - 1)) + + def alpha(self): + """Krippendorff 1980""" + # check for degenerate cases + if len(self.K) == 0: + raise ValueError("Cannot calculate alpha, no data present!") + if len(self.K) == 1: + log.debug("Only one annotation value, alpha returning 1.") + return 1 + if len(self.C) == 1 and len(self.I) == 1: + raise ValueError("Cannot calculate alpha, only one coder and item present!") + + total_disagreement = 0.0 + total_ratings = 0 + all_valid_labels_freq = FreqDist([]) + + total_do = 0.0 # Total observed disagreement for all items. + for i, itemdata in self._grouped_data("item"): + label_freqs = FreqDist(x["labels"] for x in itemdata) + labels_count = sum(label_freqs.values()) + if labels_count < 2: + # Ignore the item. + continue + all_valid_labels_freq += label_freqs + total_do += self.Disagreement(label_freqs) * labels_count + + do = total_do / sum(all_valid_labels_freq.values()) + + de = self.Disagreement(all_valid_labels_freq) # Expected disagreement. + k_alpha = 1.0 - do / de + + return k_alpha + + def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0): + """Cohen 1968""" + total = 0.0 + label_freqs = ConditionalFreqDist( + (x["coder"], x["labels"]) for x in self.data if x["coder"] in (cA, cB) + ) + for j in self.K: + for l in self.K: + total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l) + De = total / (max_distance * pow(len(self.I), 2)) + log.debug("Expected disagreement between %s and %s: %f", cA, cB, De) + Do = self.Do_Kw_pairwise(cA, cB) + ret = 1.0 - (Do / De) + return ret + + def weighted_kappa(self, max_distance=1.0): + """Cohen 1968""" + return self._pairwise_average( + lambda cA, cB: self.weighted_kappa_pairwise(cA, cB, max_distance) + ) + + +if __name__ == "__main__": + + import optparse + import re + + from nltk.metrics import distance + + # process command-line arguments + parser = optparse.OptionParser() + parser.add_option( + "-d", + "--distance", + dest="distance", + default="binary_distance", + help="distance metric to use", + ) + parser.add_option( + "-a", + "--agreement", + dest="agreement", + default="kappa", + help="agreement coefficient to calculate", + ) + parser.add_option( + "-e", + "--exclude", + dest="exclude", + action="append", + default=[], + help="coder names to exclude (may be specified multiple times)", + ) + parser.add_option( + "-i", + "--include", + dest="include", + action="append", + default=[], + help="coder names to include, same format as exclude", + ) + parser.add_option( + "-f", + "--file", + dest="file", + help="file to read labelings from, each line with three columns: 'labeler item labels'", + ) + parser.add_option( + "-v", + "--verbose", + dest="verbose", + default="0", + help="how much debugging to print on stderr (0-4)", + ) + parser.add_option( + "-c", + "--columnsep", + dest="columnsep", + default="\t", + help="char/string that separates the three columns in the file, defaults to tab", + ) + parser.add_option( + "-l", + "--labelsep", + dest="labelsep", + default=",", + help="char/string that separates labels (if labelers can assign more than one), defaults to comma", + ) + parser.add_option( + "-p", + "--presence", + dest="presence", + 
default=None, + help="convert each labeling into 1 or 0, based on presence of LABEL", + ) + parser.add_option( + "-T", + "--thorough", + dest="thorough", + default=False, + action="store_true", + help="calculate agreement for every subset of the annotators", + ) + (options, remainder) = parser.parse_args() + + if not options.file: + parser.print_help() + exit() + + logging.basicConfig(level=50 - 10 * int(options.verbose)) + + # read in data from the specified file + data = [] + with open(options.file) as infile: + for l in infile: + toks = l.split(options.columnsep) + coder, object_, labels = ( + toks[0], + str(toks[1:-1]), + frozenset(toks[-1].strip().split(options.labelsep)), + ) + if ( + (options.include == options.exclude) + or (len(options.include) > 0 and coder in options.include) + or (len(options.exclude) > 0 and coder not in options.exclude) + ): + data.append((coder, object_, labels)) + + if options.presence: + task = AnnotationTask( + data, getattr(distance, options.distance)(options.presence) + ) + else: + task = AnnotationTask(data, getattr(distance, options.distance)) + + if options.thorough: + pass + else: + print(getattr(task, options.agreement)()) + + logging.shutdown() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/aline.py b/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/aline.py new file mode 100644 index 0000000000000000000000000000000000000000..d90b3156eedaff154e65941ed4187349ccfd85c5 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/aline.py @@ -0,0 +1,1354 @@ +# Natural Language Toolkit: ALINE +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Greg Kondrak +# Geoff Bacon (Python port) +# URL: +# For license information, see LICENSE.TXT + +""" +ALINE +https://webdocs.cs.ualberta.ca/~kondrak/ +Copyright 2002 by Grzegorz Kondrak. + +ALINE is an algorithm for aligning phonetic sequences, described in [1]. +This module is a port of Kondrak's (2002) ALINE. It provides functions for +phonetic sequence alignment and similarity analysis. These are useful in +historical linguistics, sociolinguistics and synchronic phonology. + +ALINE has parameters that can be tuned for desired output. These parameters are: +- C_skip, C_sub, C_exp, C_vwl +- Salience weights +- Segmental features + +In this implementation, some parameters have been changed from their default +values as described in [1], in order to replicate published results. All changes +are noted in comments. + +Example usage +------------- + +# Get optimal alignment of two phonetic sequences + +>>> align('θin', 'tenwis') # doctest: +SKIP +[[('θ', 't'), ('i', 'e'), ('n', 'n'), ('-', 'w'), ('-', 'i'), ('-', 's')]] + +[1] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation, +University of Toronto. 
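+
+As a usage note: with a nonzero epsilon, align() also returns near-optimal
+alignments, i.e. every alignment whose score is at least (1 - epsilon) times
+the optimal score.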
+""" + +try: + import numpy as np +except ImportError: + np = None + +# === Constants === + +inf = float("inf") + +# Default values for maximum similarity scores (Kondrak 2002: 54) +C_skip = -10 # Indels +C_sub = 35 # Substitutions +C_exp = 45 # Expansions/compressions +C_vwl = 5 # Vowel/consonant relative weight (decreased from 10) + +consonants = [ + "B", + "N", + "R", + "b", + "c", + "d", + "f", + "g", + "h", + "j", + "k", + "l", + "m", + "n", + "p", + "q", + "r", + "s", + "t", + "v", + "x", + "z", + "ç", + "ð", + "ħ", + "ŋ", + "ɖ", + "ɟ", + "ɢ", + "ɣ", + "ɦ", + "ɬ", + "ɮ", + "ɰ", + "ɱ", + "ɲ", + "ɳ", + "ɴ", + "ɸ", + "ɹ", + "ɻ", + "ɽ", + "ɾ", + "ʀ", + "ʁ", + "ʂ", + "ʃ", + "ʈ", + "ʋ", + "ʐ ", + "ʒ", + "ʔ", + "ʕ", + "ʙ", + "ʝ", + "β", + "θ", + "χ", + "ʐ", + "w", +] + +# Relevant features for comparing consonants and vowels +R_c = [ + "aspirated", + "lateral", + "manner", + "nasal", + "place", + "retroflex", + "syllabic", + "voice", +] +# 'high' taken out of R_v because same as manner +R_v = [ + "back", + "lateral", + "long", + "manner", + "nasal", + "place", + "retroflex", + "round", + "syllabic", + "voice", +] + +# Flattened feature matrix (Kondrak 2002: 56) +similarity_matrix = { + # place + "bilabial": 1.0, + "labiodental": 0.95, + "dental": 0.9, + "alveolar": 0.85, + "retroflex": 0.8, + "palato-alveolar": 0.75, + "palatal": 0.7, + "velar": 0.6, + "uvular": 0.5, + "pharyngeal": 0.3, + "glottal": 0.1, + "labiovelar": 1.0, + "vowel": -1.0, # added 'vowel' + # manner + "stop": 1.0, + "affricate": 0.9, + "fricative": 0.85, # increased fricative from 0.8 + "trill": 0.7, + "tap": 0.65, + "approximant": 0.6, + "high vowel": 0.4, + "mid vowel": 0.2, + "low vowel": 0.0, + "vowel2": 0.5, # added vowel + # high + "high": 1.0, + "mid": 0.5, + "low": 0.0, + # back + "front": 1.0, + "central": 0.5, + "back": 0.0, + # binary features + "plus": 1.0, + "minus": 0.0, +} + +# Relative weights of phonetic features (Kondrak 2002: 55) +salience = { + "syllabic": 5, + "place": 40, + "manner": 50, + "voice": 5, # decreased from 10 + "nasal": 20, # increased from 10 + "retroflex": 10, + "lateral": 10, + "aspirated": 5, + "long": 0, # decreased from 1 + "high": 3, # decreased from 5 + "back": 2, # decreased from 5 + "round": 2, # decreased from 5 +} + +# (Kondrak 2002: 59-60) +feature_matrix = { + # Consonants + "p": { + "place": "bilabial", + "manner": "stop", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "b": { + "place": "bilabial", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "t": { + "place": "alveolar", + "manner": "stop", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "d": { + "place": "alveolar", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ʈ": { + "place": "retroflex", + "manner": "stop", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "plus", + "lateral": "minus", + "aspirated": "minus", + }, + "ɖ": { + "place": "retroflex", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "plus", + "lateral": "minus", + "aspirated": "minus", + }, + "c": { + "place": "palatal", + "manner": "stop", + "syllabic": "minus", + "voice": 
"minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ɟ": { + "place": "palatal", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "k": { + "place": "velar", + "manner": "stop", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "g": { + "place": "velar", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "q": { + "place": "uvular", + "manner": "stop", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ɢ": { + "place": "uvular", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ʔ": { + "place": "glottal", + "manner": "stop", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "m": { + "place": "bilabial", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "plus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ɱ": { + "place": "labiodental", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "plus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "n": { + "place": "alveolar", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "plus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ɳ": { + "place": "retroflex", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "plus", + "retroflex": "plus", + "lateral": "minus", + "aspirated": "minus", + }, + "ɲ": { + "place": "palatal", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "plus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ŋ": { + "place": "velar", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "plus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ɴ": { + "place": "uvular", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "plus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "N": { + "place": "uvular", + "manner": "stop", + "syllabic": "minus", + "voice": "plus", + "nasal": "plus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ʙ": { + "place": "bilabial", + "manner": "trill", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "B": { + "place": "bilabial", + "manner": "trill", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "r": { + "place": "alveolar", + "manner": "trill", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "plus", + "lateral": "minus", + "aspirated": "minus", + }, + "ʀ": { + "place": "uvular", + "manner": "trill", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "R": { + "place": "uvular", + "manner": "trill", + "syllabic": 
"minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ɾ": { + "place": "alveolar", + "manner": "tap", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ɽ": { + "place": "retroflex", + "manner": "tap", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "plus", + "lateral": "minus", + "aspirated": "minus", + }, + "ɸ": { + "place": "bilabial", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "β": { + "place": "bilabial", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "f": { + "place": "labiodental", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "v": { + "place": "labiodental", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "θ": { + "place": "dental", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ð": { + "place": "dental", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "s": { + "place": "alveolar", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "z": { + "place": "alveolar", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ʃ": { + "place": "palato-alveolar", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ʒ": { + "place": "palato-alveolar", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ʂ": { + "place": "retroflex", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "plus", + "lateral": "minus", + "aspirated": "minus", + }, + "ʐ": { + "place": "retroflex", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "plus", + "lateral": "minus", + "aspirated": "minus", + }, + "ç": { + "place": "palatal", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ʝ": { + "place": "palatal", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "x": { + "place": "velar", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ɣ": { + "place": "velar", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + 
"retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "χ": { + "place": "uvular", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ʁ": { + "place": "uvular", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ħ": { + "place": "pharyngeal", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ʕ": { + "place": "pharyngeal", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "h": { + "place": "glottal", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ɦ": { + "place": "glottal", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ɬ": { + "place": "alveolar", + "manner": "fricative", + "syllabic": "minus", + "voice": "minus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "plus", + "aspirated": "minus", + }, + "ɮ": { + "place": "alveolar", + "manner": "fricative", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "plus", + "aspirated": "minus", + }, + "ʋ": { + "place": "labiodental", + "manner": "approximant", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ɹ": { + "place": "alveolar", + "manner": "approximant", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ɻ": { + "place": "retroflex", + "manner": "approximant", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "plus", + "lateral": "minus", + "aspirated": "minus", + }, + "j": { + "place": "palatal", + "manner": "approximant", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "ɰ": { + "place": "velar", + "manner": "approximant", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + "l": { + "place": "alveolar", + "manner": "approximant", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "plus", + "aspirated": "minus", + }, + "w": { + "place": "labiovelar", + "manner": "approximant", + "syllabic": "minus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "aspirated": "minus", + }, + # Vowels + "i": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "high", + "back": "front", + "round": "minus", + "long": "minus", + "aspirated": "minus", + }, + "y": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "high", + "back": "front", + "round": "plus", + "long": "minus", + "aspirated": "minus", + }, + "e": { + "place": 
"vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "mid", + "back": "front", + "round": "minus", + "long": "minus", + "aspirated": "minus", + }, + "E": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "mid", + "back": "front", + "round": "minus", + "long": "plus", + "aspirated": "minus", + }, + "ø": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "mid", + "back": "front", + "round": "plus", + "long": "minus", + "aspirated": "minus", + }, + "ɛ": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "mid", + "back": "front", + "round": "minus", + "long": "minus", + "aspirated": "minus", + }, + "œ": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "mid", + "back": "front", + "round": "plus", + "long": "minus", + "aspirated": "minus", + }, + "æ": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "low", + "back": "front", + "round": "minus", + "long": "minus", + "aspirated": "minus", + }, + "a": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "low", + "back": "front", + "round": "minus", + "long": "minus", + "aspirated": "minus", + }, + "A": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "low", + "back": "front", + "round": "minus", + "long": "plus", + "aspirated": "minus", + }, + "ɨ": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "high", + "back": "central", + "round": "minus", + "long": "minus", + "aspirated": "minus", + }, + "ʉ": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "high", + "back": "central", + "round": "plus", + "long": "minus", + "aspirated": "minus", + }, + "ə": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "mid", + "back": "central", + "round": "minus", + "long": "minus", + "aspirated": "minus", + }, + "u": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "high", + "back": "back", + "round": "plus", + "long": "minus", + "aspirated": "minus", + }, + "U": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "high", + "back": "back", + "round": "plus", + "long": "plus", + "aspirated": "minus", + }, + "o": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": 
"mid", + "back": "back", + "round": "plus", + "long": "minus", + "aspirated": "minus", + }, + "O": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "mid", + "back": "back", + "round": "plus", + "long": "plus", + "aspirated": "minus", + }, + "ɔ": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "mid", + "back": "back", + "round": "plus", + "long": "minus", + "aspirated": "minus", + }, + "ɒ": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "low", + "back": "back", + "round": "minus", + "long": "minus", + "aspirated": "minus", + }, + "I": { + "place": "vowel", + "manner": "vowel2", + "syllabic": "plus", + "voice": "plus", + "nasal": "minus", + "retroflex": "minus", + "lateral": "minus", + "high": "high", + "back": "front", + "round": "minus", + "long": "plus", + "aspirated": "minus", + }, +} + +# === Algorithm === + + +def align(str1, str2, epsilon=0): + """ + Compute the alignment of two phonetic strings. + + :param str str1: First string to be aligned + :param str str2: Second string to be aligned + + :type epsilon: float (0.0 to 1.0) + :param epsilon: Adjusts threshold similarity score for near-optimal alignments + + :rtype: list(list(tuple(str, str))) + :return: Alignment(s) of str1 and str2 + + (Kondrak 2002: 51) + """ + if np is None: + raise ImportError("You need numpy in order to use the align function") + + assert 0.0 <= epsilon <= 1.0, "Epsilon must be between 0.0 and 1.0." + m = len(str1) + n = len(str2) + # This includes Kondrak's initialization of row 0 and column 0 to all 0s. + S = np.zeros((m + 1, n + 1), dtype=float) + + # If i <= 1 or j <= 1, don't allow expansions as it doesn't make sense, + # and breaks array and string indices. Make sure they never get chosen + # by setting them to -inf. + for i in range(1, m + 1): + for j in range(1, n + 1): + edit1 = S[i - 1, j] + sigma_skip(str1[i - 1]) + edit2 = S[i, j - 1] + sigma_skip(str2[j - 1]) + edit3 = S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + if i > 1: + edit4 = S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i]) + else: + edit4 = -inf + if j > 1: + edit5 = S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j]) + else: + edit5 = -inf + S[i, j] = max(edit1, edit2, edit3, edit4, edit5, 0) + + T = (1 - epsilon) * np.amax(S) # Threshold score for near-optimal alignments + + alignments = [] + for i in range(1, m + 1): + for j in range(1, n + 1): + if S[i, j] >= T: + alignments.append(_retrieve(i, j, 0, S, T, str1, str2, [])) + return alignments + + +def _retrieve(i, j, s, S, T, str1, str2, out): + """ + Retrieve the path through the similarity matrix S starting at (i, j). 
+ + :rtype: list(tuple(str, str)) + :return: Alignment of str1 and str2 + """ + if S[i, j] == 0: + return out + else: + if j > 1 and S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j]) + s >= T: + out.insert(0, (str1[i - 1], str2[j - 2 : j])) + _retrieve( + i - 1, + j - 2, + s + sigma_exp(str1[i - 1], str2[j - 2 : j]), + S, + T, + str1, + str2, + out, + ) + elif ( + i > 1 and S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i]) + s >= T + ): + out.insert(0, (str1[i - 2 : i], str2[j - 1])) + _retrieve( + i - 2, + j - 1, + s + sigma_exp(str2[j - 1], str1[i - 2 : i]), + S, + T, + str1, + str2, + out, + ) + elif S[i, j - 1] + sigma_skip(str2[j - 1]) + s >= T: + out.insert(0, ("-", str2[j - 1])) + _retrieve(i, j - 1, s + sigma_skip(str2[j - 1]), S, T, str1, str2, out) + elif S[i - 1, j] + sigma_skip(str1[i - 1]) + s >= T: + out.insert(0, (str1[i - 1], "-")) + _retrieve(i - 1, j, s + sigma_skip(str1[i - 1]), S, T, str1, str2, out) + elif S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + s >= T: + out.insert(0, (str1[i - 1], str2[j - 1])) + _retrieve( + i - 1, + j - 1, + s + sigma_sub(str1[i - 1], str2[j - 1]), + S, + T, + str1, + str2, + out, + ) + return out + + +def sigma_skip(p): + """ + Returns score of an indel of P. + + (Kondrak 2002: 54) + """ + return C_skip + + +def sigma_sub(p, q): + """ + Returns score of a substitution of P with Q. + + (Kondrak 2002: 54) + """ + return C_sub - delta(p, q) - V(p) - V(q) + + +def sigma_exp(p, q): + """ + Returns score of an expansion/compression. + + (Kondrak 2002: 54) + """ + q1 = q[0] + q2 = q[1] + return C_exp - delta(p, q1) - delta(p, q2) - V(p) - max(V(q1), V(q2)) + + +def delta(p, q): + """ + Return weighted sum of difference between P and Q. + + (Kondrak 2002: 54) + """ + features = R(p, q) + total = 0 + for f in features: + total += diff(p, q, f) * salience[f] + return total + + +def diff(p, q, f): + """ + Returns difference between phonetic segments P and Q for feature F. + + (Kondrak 2002: 52, 54) + """ + p_features, q_features = feature_matrix[p], feature_matrix[q] + return abs(similarity_matrix[p_features[f]] - similarity_matrix[q_features[f]]) + + +def R(p, q): + """ + Return relevant features for segment comparison. + + (Kondrak 2002: 54) + """ + if p in consonants or q in consonants: + return R_c + return R_v + + +def V(p): + """ + Return vowel weight if P is vowel. + + (Kondrak 2002: 54) + """ + if p in consonants: + return 0 + return C_vwl + + +# === Test === + + +def demo(): + """ + A demonstration of the result of aligning phonetic sequences + used in Kondrak's (2002) dissertation. 
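+
+    Each line of cognate_data below holds one comma-separated word pair;
+    the first (optimal) alignment returned by align() is printed for each
+    pair. Requires numpy.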
+ """ + data = [pair.split(",") for pair in cognate_data.split("\n")] + for pair in data: + alignment = align(pair[0], pair[1])[0] + alignment = [f"({a[0]}, {a[1]})" for a in alignment] + alignment = " ".join(alignment) + print(f"{pair[0]} ~ {pair[1]} : {alignment}") + + +cognate_data = """jo,ʒə +tu,ty +nosotros,nu +kjen,ki +ke,kwa +todos,tu +una,ən +dos,dø +tres,trwa +ombre,om +arbol,arbrə +pluma,plym +kabeθa,kap +boka,buʃ +pje,pje +koraθon,kœr +ber,vwar +benir,vənir +deθir,dir +pobre,povrə +ðis,dIzes +ðæt,das +wat,vas +nat,nixt +loŋ,laŋ +mæn,man +fleʃ,flajʃ +bləd,blyt +feðər,fEdər +hær,hAr +ir,Or +aj,awgə +nowz,nAzə +mawθ,munt +təŋ,tsuŋə +fut,fys +nij,knI +hænd,hant +hart,herts +livər,lEbər +ænd,ante +æt,ad +blow,flAre +ir,awris +ijt,edere +fiʃ,piʃkis +flow,fluere +staɾ,stella +ful,plenus +græs,gramen +hart,kordis +horn,korny +aj,ego +nij,genU +məðər,mAter +mawntən,mons +nejm,nomen +njuw,nowus +wən,unus +rawnd,rotundus +sow,suere +sit,sedere +θrij,tres +tuwθ,dentis +θin,tenwis +kinwawa,kenuaʔ +nina,nenah +napewa,napɛw +wapimini,wapemen +namesa,namɛʔs +okimawa,okemaw +ʃiʃipa,seʔsep +ahkohkwa,ahkɛh +pematesiweni,pematesewen +asenja,aʔsɛn""" + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/association.py b/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/association.py new file mode 100644 index 0000000000000000000000000000000000000000..7bbdf563b9054fd44c55209444c500c9173abb10 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/association.py @@ -0,0 +1,476 @@ +# Natural Language Toolkit: Ngram Association Measures +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Joel Nothman +# URL: +# For license information, see LICENSE.TXT + +""" +Provides scoring functions for a number of association measures through a +generic, abstract implementation in ``NgramAssocMeasures``, and n-specific +``BigramAssocMeasures`` and ``TrigramAssocMeasures``. +""" + +import math as _math +from abc import ABCMeta, abstractmethod +from functools import reduce + +_log2 = lambda x: _math.log2(x) +_ln = _math.log + +_product = lambda s: reduce(lambda x, y: x * y, s) + +_SMALL = 1e-20 + +try: + from scipy.stats import fisher_exact +except ImportError: + + def fisher_exact(*_args, **_kwargs): + raise NotImplementedError + + +### Indices to marginals arguments: + +NGRAM = 0 +"""Marginals index for the ngram count""" + +UNIGRAMS = -2 +"""Marginals index for a tuple of each unigram count""" + +TOTAL = -1 +"""Marginals index for the number of words in the data""" + + +class NgramAssocMeasures(metaclass=ABCMeta): + """ + An abstract class defining a collection of generic association measures. + Each public method returns a score, taking the following arguments:: + + score_fn(count_of_ngram, + (count_of_n-1gram_1, ..., count_of_n-1gram_j), + (count_of_n-2gram_1, ..., count_of_n-2gram_k), + ..., + (count_of_1gram_1, ..., count_of_1gram_n), + count_of_total_words) + + See ``BigramAssocMeasures`` and ``TrigramAssocMeasures`` + + Inheriting classes should define a property _n, and a method _contingency + which calculates contingency values from marginals in order for all + association measures defined here to be usable. 
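+
+    For example (a sketch with made-up counts: a bigram occurring 4 times,
+    whose words occur 10 and 12 times, in a corpus of 1000 words):
+
+    >>> from nltk.metrics.association import BigramAssocMeasures
+    >>> BigramAssocMeasures.raw_freq(4, (10, 12), 1000)
+    0.004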
+    """
+
+    _n = 0
+
+    @staticmethod
+    @abstractmethod
+    def _contingency(*marginals):
+        """Calculates values of a contingency table from marginal values."""
+        raise NotImplementedError(
+            "The contingency table is not available in the general ngram case"
+        )
+
+    @staticmethod
+    @abstractmethod
+    def _marginals(*contingency):
+        """Calculates values of contingency table marginals from its values."""
+        raise NotImplementedError(
+            "The contingency table is not available in the general ngram case"
+        )
+
+    @classmethod
+    def _expected_values(cls, cont):
+        """Calculates expected values for a contingency table."""
+        n_all = sum(cont)
+        bits = [1 << i for i in range(cls._n)]
+
+        # For each contingency table cell
+        for i in range(len(cont)):
+            # Yield the expected value
+            yield (
+                _product(
+                    sum(cont[x] for x in range(2**cls._n) if (x & j) == (i & j))
+                    for j in bits
+                )
+                / (n_all ** (cls._n - 1))
+            )
+
+    @staticmethod
+    def raw_freq(*marginals):
+        """Scores ngrams by their frequency"""
+        return marginals[NGRAM] / marginals[TOTAL]
+
+    @classmethod
+    def student_t(cls, *marginals):
+        """Scores ngrams using Student's t test with independence hypothesis
+        for unigrams, as in Manning and Schutze 5.3.1.
+        """
+        return (
+            marginals[NGRAM]
+            - _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
+        ) / (marginals[NGRAM] + _SMALL) ** 0.5
+
+    @classmethod
+    def chi_sq(cls, *marginals):
+        """Scores ngrams using Pearson's chi-square as in Manning and Schutze
+        5.3.3.
+        """
+        cont = cls._contingency(*marginals)
+        exps = cls._expected_values(cont)
+        return sum((obs - exp) ** 2 / (exp + _SMALL) for obs, exp in zip(cont, exps))
+
+    @staticmethod
+    def mi_like(*marginals, **kwargs):
+        """Scores ngrams using a variant of mutual information. The keyword
+        argument power sets an exponent (default 3) for the numerator. No
+        logarithm of the result is calculated.
+        """
+        return marginals[NGRAM] ** kwargs.get("power", 3) / _product(
+            marginals[UNIGRAMS]
+        )
+
+    @classmethod
+    def pmi(cls, *marginals):
+        """Scores ngrams by pointwise mutual information, as in Manning and
+        Schutze 5.4.
+        """
+        return _log2(marginals[NGRAM] * marginals[TOTAL] ** (cls._n - 1)) - _log2(
+            _product(marginals[UNIGRAMS])
+        )
+
+    @classmethod
+    def likelihood_ratio(cls, *marginals):
+        """Scores ngrams using likelihood ratios as in Manning and Schutze 5.3.4."""
+        cont = cls._contingency(*marginals)
+        return 2 * sum(
+            obs * _ln(obs / (exp + _SMALL) + _SMALL)
+            for obs, exp in zip(cont, cls._expected_values(cont))
+        )
+
+    @classmethod
+    def poisson_stirling(cls, *marginals):
+        """Scores ngrams using the Poisson-Stirling measure."""
+        exp = _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1))
+        return marginals[NGRAM] * (_log2(marginals[NGRAM] / exp) - 1)
+
+    @classmethod
+    def jaccard(cls, *marginals):
+        """Scores ngrams using the Jaccard index."""
+        cont = cls._contingency(*marginals)
+        return cont[0] / sum(cont[:-1])
+
+
+class BigramAssocMeasures(NgramAssocMeasures):
+    """
+    A collection of bigram association measures. Each association measure
+    is provided as a function with three arguments::
+
+        bigram_score_fn(n_ii, (n_ix, n_xi), n_xx)
+
+    The arguments constitute the marginals of a contingency table, counting
+    the occurrences of particular events in a corpus. The letter i in the
+    suffix refers to the appearance of the word in question, while x indicates
+    the appearance of any word. Thus, for example:
+
+    - n_ii counts ``(w1, w2)``, i.e. the bigram being scored
+    - n_ix counts ``(w1, *)``
+    - n_xi counts ``(*, w2)``
+    - n_xx counts ``(*, *)``, i.e. any bigram
+
+    This may be shown with respect to a contingency table::
+
+                w1    ~w1
+             ------ ------
+         w2 | n_ii | n_oi | = n_xi
+             ------ ------
+        ~w2 | n_io | n_oo |
+             ------ ------
+             = n_ix        TOTAL = n_xx
+    """
+
+    _n = 2
+
+    @staticmethod
+    def _contingency(n_ii, n_ix_xi_tuple, n_xx):
+        """Calculates values of a bigram contingency table from marginal values."""
+        (n_ix, n_xi) = n_ix_xi_tuple
+        n_oi = n_xi - n_ii
+        n_io = n_ix - n_ii
+        return (n_ii, n_oi, n_io, n_xx - n_ii - n_oi - n_io)
+
+    @staticmethod
+    def _marginals(n_ii, n_oi, n_io, n_oo):
+        """Calculates values of contingency table marginals from its values."""
+        return (n_ii, (n_oi + n_ii, n_io + n_ii), n_oo + n_oi + n_io + n_ii)
+
+    @staticmethod
+    def _expected_values(cont):
+        """Calculates expected values for a contingency table."""
+        n_xx = sum(cont)
+        # For each contingency table cell
+        for i in range(4):
+            yield (cont[i] + cont[i ^ 1]) * (cont[i] + cont[i ^ 2]) / n_xx
+
+    @classmethod
+    def phi_sq(cls, *marginals):
+        """Scores bigrams using phi-square, the square of the Pearson correlation
+        coefficient.
+        """
+        n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)
+
+        return (n_ii * n_oo - n_io * n_oi) ** 2 / (
+            (n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo)
+        )
+
+    @classmethod
+    def chi_sq(cls, n_ii, n_ix_xi_tuple, n_xx):
+        """Scores bigrams using chi-square, i.e. phi-sq multiplied by the number
+        of bigrams, as in Manning and Schutze 5.3.3.
+        """
+        (n_ix, n_xi) = n_ix_xi_tuple
+        return n_xx * cls.phi_sq(n_ii, (n_ix, n_xi), n_xx)
+
+    @classmethod
+    def fisher(cls, *marginals):
+        """Scores bigrams using Fisher's Exact Test (Pedersen 1996). Less
+        sensitive to small counts than PMI or Chi Sq, but also more expensive
+        to compute. Requires scipy.
+        """
+
+        n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals)
+
+        (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative="less")
+        return pvalue
+
+    @staticmethod
+    def dice(n_ii, n_ix_xi_tuple, n_xx):
+        """Scores bigrams using Dice's coefficient."""
+        (n_ix, n_xi) = n_ix_xi_tuple
+        return 2 * n_ii / (n_ix + n_xi)
+
+
+class TrigramAssocMeasures(NgramAssocMeasures):
+    """
+    A collection of trigram association measures. Each association measure
+    is provided as a function with four arguments::
+
+        trigram_score_fn(n_iii,
+                         (n_iix, n_ixi, n_xii),
+                         (n_ixx, n_xix, n_xxi),
+                         n_xxx)
+
+    The arguments constitute the marginals of a contingency table, counting
+    the occurrences of particular events in a corpus. The letter i in the
+    suffix refers to the appearance of the word in question, while x indicates
+    the appearance of any word. Thus, for example:
+
+    - n_iii counts ``(w1, w2, w3)``, i.e. the trigram being scored
+    - n_ixx counts ``(w1, *, *)``
+    - n_xxx counts ``(*, *, *)``, i.e. any trigram
+    """
+
+    _n = 3
+
+    @staticmethod
+    def _contingency(n_iii, n_iix_tuple, n_ixx_tuple, n_xxx):
+        """Calculates values of a trigram contingency table (or cube) from
+        marginal values.
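+
+        In the returned tuple, an ``o`` in a subscript marks the absence of
+        the corresponding word, so e.g. ``n_oii`` counts trigrams matching
+        ``(~w1, w2, w3)``:
+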
+ >>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000) + (1, 0, 0, 0, 0, 72, 0, 1927) + """ + (n_iix, n_ixi, n_xii) = n_iix_tuple + (n_ixx, n_xix, n_xxi) = n_ixx_tuple + n_oii = n_xii - n_iii + n_ioi = n_ixi - n_iii + n_iio = n_iix - n_iii + n_ooi = n_xxi - n_iii - n_oii - n_ioi + n_oio = n_xix - n_iii - n_oii - n_iio + n_ioo = n_ixx - n_iii - n_ioi - n_iio + n_ooo = n_xxx - n_iii - n_oii - n_ioi - n_iio - n_ooi - n_oio - n_ioo + + return (n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo) + + @staticmethod + def _marginals(*contingency): + """Calculates values of contingency table marginals from its values. + >>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927) + (1, (1, 1, 1), (1, 73, 1), 2000) + """ + n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo = contingency + return ( + n_iii, + (n_iii + n_iio, n_iii + n_ioi, n_iii + n_oii), + ( + n_iii + n_ioi + n_iio + n_ioo, + n_iii + n_oii + n_iio + n_oio, + n_iii + n_oii + n_ioi + n_ooi, + ), + sum(contingency), + ) + + +class QuadgramAssocMeasures(NgramAssocMeasures): + """ + A collection of quadgram association measures. Each association measure + is provided as a function with five arguments:: + + trigram_score_fn(n_iiii, + (n_iiix, n_iixi, n_ixii, n_xiii), + (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix), + (n_ixxx, n_xixx, n_xxix, n_xxxi), + n_all) + + The arguments constitute the marginals of a contingency table, counting + the occurrences of particular events in a corpus. The letter i in the + suffix refers to the appearance of the word in question, while x indicates + the appearance of any word. Thus, for example: + + - n_iiii counts ``(w1, w2, w3, w4)``, i.e. the quadgram being scored + - n_ixxi counts ``(w1, *, *, w4)`` + - n_xxxx counts ``(*, *, *, *)``, i.e. any quadgram + """ + + _n = 4 + + @staticmethod + def _contingency(n_iiii, n_iiix_tuple, n_iixx_tuple, n_ixxx_tuple, n_xxxx): + """Calculates values of a quadgram contingency table from + marginal values. + """ + (n_iiix, n_iixi, n_ixii, n_xiii) = n_iiix_tuple + (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix) = n_iixx_tuple + (n_ixxx, n_xixx, n_xxix, n_xxxi) = n_ixxx_tuple + n_oiii = n_xiii - n_iiii + n_ioii = n_ixii - n_iiii + n_iioi = n_iixi - n_iiii + n_ooii = n_xxii - n_iiii - n_oiii - n_ioii + n_oioi = n_xixi - n_iiii - n_oiii - n_iioi + n_iooi = n_ixxi - n_iiii - n_ioii - n_iioi + n_oooi = n_xxxi - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_iooi - n_oioi + n_iiio = n_iiix - n_iiii + n_oiio = n_xiix - n_iiii - n_oiii - n_iiio + n_ioio = n_ixix - n_iiii - n_ioii - n_iiio + n_ooio = n_xxix - n_iiii - n_oiii - n_ioii - n_iiio - n_ooii - n_ioio - n_oiio + n_iioo = n_iixx - n_iiii - n_iioi - n_iiio + n_oioo = n_xixx - n_iiii - n_oiii - n_iioi - n_iiio - n_oioi - n_oiio - n_iioo + n_iooo = n_ixxx - n_iiii - n_ioii - n_iioi - n_iiio - n_iooi - n_iioo - n_ioio + n_oooo = ( + n_xxxx + - n_iiii + - n_oiii + - n_ioii + - n_iioi + - n_ooii + - n_oioi + - n_iooi + - n_oooi + - n_iiio + - n_oiio + - n_ioio + - n_ooio + - n_iioo + - n_oioo + - n_iooo + ) + + return ( + n_iiii, + n_oiii, + n_ioii, + n_ooii, + n_iioi, + n_oioi, + n_iooi, + n_oooi, + n_iiio, + n_oiio, + n_ioio, + n_ooio, + n_iioo, + n_oioo, + n_iooo, + n_oooo, + ) + + @staticmethod + def _marginals(*contingency): + """Calculates values of contingency table marginals from its values. 
+ QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653) + (1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540) + """ + ( + n_iiii, + n_oiii, + n_ioii, + n_ooii, + n_iioi, + n_oioi, + n_iooi, + n_oooi, + n_iiio, + n_oiio, + n_ioio, + n_ooio, + n_iioo, + n_oioo, + n_iooo, + n_oooo, + ) = contingency + + n_iiix = n_iiii + n_iiio + n_iixi = n_iiii + n_iioi + n_ixii = n_iiii + n_ioii + n_xiii = n_iiii + n_oiii + + n_iixx = n_iiii + n_iioi + n_iiio + n_iioo + n_ixix = n_iiii + n_ioii + n_iiio + n_ioio + n_ixxi = n_iiii + n_ioii + n_iioi + n_iooi + n_xixi = n_iiii + n_oiii + n_iioi + n_oioi + n_xxii = n_iiii + n_oiii + n_ioii + n_ooii + n_xiix = n_iiii + n_oiii + n_iiio + n_oiio + + n_ixxx = n_iiii + n_ioii + n_iioi + n_iiio + n_iooi + n_iioo + n_ioio + n_iooo + n_xixx = n_iiii + n_oiii + n_iioi + n_iiio + n_oioi + n_oiio + n_iioo + n_oioo + n_xxix = n_iiii + n_oiii + n_ioii + n_iiio + n_ooii + n_ioio + n_oiio + n_ooio + n_xxxi = n_iiii + n_oiii + n_ioii + n_iioi + n_ooii + n_iooi + n_oioi + n_oooi + + n_all = sum(contingency) + + return ( + n_iiii, + (n_iiix, n_iixi, n_ixii, n_xiii), + (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix), + (n_ixxx, n_xixx, n_xxix, n_xxxi), + n_all, + ) + + +class ContingencyMeasures: + """Wraps NgramAssocMeasures classes such that the arguments of association + measures are contingency table values rather than marginals. + """ + + def __init__(self, measures): + """Constructs a ContingencyMeasures given a NgramAssocMeasures class""" + self.__class__.__name__ = "Contingency" + measures.__class__.__name__ + for k in dir(measures): + if k.startswith("__"): + continue + v = getattr(measures, k) + if not k.startswith("_"): + v = self._make_contingency_fn(measures, v) + setattr(self, k, v) + + @staticmethod + def _make_contingency_fn(measures, old_fn): + """From an association measure function, produces a new function which + accepts contingency table values as its arguments. + """ + + def res(*contingency): + return old_fn(*measures._marginals(*contingency)) + + res.__doc__ = old_fn.__doc__ + res.__name__ = old_fn.__name__ + return res diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/confusionmatrix.py b/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/confusionmatrix.py new file mode 100644 index 0000000000000000000000000000000000000000..5e0136df4d58a9fa845f98b240f9ca8622d218d5 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/confusionmatrix.py @@ -0,0 +1,353 @@ +# Natural Language Toolkit: Confusion Matrices +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird +# Tom Aarsen <> +# URL: +# For license information, see LICENSE.TXT + +from nltk.probability import FreqDist + + +class ConfusionMatrix: + """ + The confusion matrix between a list of reference values and a + corresponding list of test values. Entry *[r,t]* of this + matrix is a count of the number of times that the reference value + *r* corresponds to the test value *t*. E.g.: + + >>> from nltk.metrics import ConfusionMatrix + >>> ref = 'DET NN VB DET JJ NN NN IN DET NN'.split() + >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split() + >>> cm = ConfusionMatrix(ref, test) + >>> print(cm['NN', 'NN']) + 3 + + Note that the diagonal entries *Ri=Tj* of this matrix + corresponds to correct values; and the off-diagonal entries + correspond to incorrect values. 
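+
+    For example, continuing the doctest above, the single 'NN' reference
+    token that was mis-tagged as 'VB' appears off the diagonal:
+
+    >>> print(cm['NN', 'VB'])
+    1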
+    """
+
+    def __init__(self, reference, test, sort_by_count=False):
+        """
+        Construct a new confusion matrix from a list of reference
+        values and a corresponding list of test values.
+
+        :type reference: list
+        :param reference: An ordered list of reference values.
+        :type test: list
+        :param test: A list of values to compare against the
+            corresponding reference values.
+        :raise ValueError: If ``reference`` and ``test`` do not have
+            the same length.
+        """
+        if len(reference) != len(test):
+            raise ValueError("Lists must have the same length.")
+
+        # Get a list of all values.
+        if sort_by_count:
+            ref_fdist = FreqDist(reference)
+            test_fdist = FreqDist(test)
+
+            def key(v):
+                return -(ref_fdist[v] + test_fdist[v])
+
+            values = sorted(set(reference + test), key=key)
+        else:
+            values = sorted(set(reference + test))
+
+        # Construct a value->index dictionary
+        indices = {val: i for (i, val) in enumerate(values)}
+
+        # Make a confusion matrix table.
+        confusion = [[0 for _ in values] for _ in values]
+        max_conf = 0  # Maximum confusion
+        for w, g in zip(reference, test):
+            confusion[indices[w]][indices[g]] += 1
+            max_conf = max(max_conf, confusion[indices[w]][indices[g]])
+
+        #: A list of all values in ``reference`` or ``test``.
+        self._values = values
+        #: A dictionary mapping values in ``self._values`` to their indices.
+        self._indices = indices
+        #: The confusion matrix itself (as a list of lists of counts).
+        self._confusion = confusion
+        #: The greatest count in ``self._confusion`` (used for printing).
+        self._max_conf = max_conf
+        #: The total number of values in the confusion matrix.
+        self._total = len(reference)
+        #: The number of correct (on-diagonal) values in the matrix.
+        self._correct = sum(confusion[i][i] for i in range(len(values)))
+
+    def __getitem__(self, li_lj_tuple):
+        """
+        :return: The number of times that value ``li`` was expected and
+            value ``lj`` was given.
+        :rtype: int
+        """
+        (li, lj) = li_lj_tuple
+        i = self._indices[li]
+        j = self._indices[lj]
+        return self._confusion[i][j]
+
+    def __repr__(self):
+        return f"<ConfusionMatrix: {self._correct}/{self._total} correct>"
+
+    def __str__(self):
+        return self.pretty_format()
+
+    def pretty_format(
+        self,
+        show_percents=False,
+        values_in_chart=True,
+        truncate=None,
+        sort_by_count=False,
+    ):
+        """
+        :return: A multi-line string representation of this confusion matrix.
+        :type truncate: int
+        :param truncate: If specified, then only show the specified
+            number of values. Any sorting (e.g., sort_by_count)
+            will be performed before truncation.
+        :param sort_by_count: If true, then sort by the count of each
+            label in the reference data. I.e., labels that occur more
+            frequently in the reference label will be towards the left
+            edge of the matrix, and labels that occur less frequently
+            will be towards the right edge.
+
+        @todo: add marginals?
+        """
+        confusion = self._confusion
+
+        values = self._values
+        if sort_by_count:
+            values = sorted(
+                values, key=lambda v: -sum(self._confusion[self._indices[v]])
+            )
+
+        if truncate:
+            values = values[:truncate]
+
+        if values_in_chart:
+            value_strings = ["%s" % val for val in values]
+        else:
+            value_strings = [str(n + 1) for n in range(len(values))]
+
+        # Construct a format string for row values
+        valuelen = max(len(val) for val in value_strings)
+        value_format = "%" + repr(valuelen) + "s | "
+        # Construct a format string for matrix entries
+        if show_percents:
+            entrylen = 6
+            entry_format = "%5.1f%%"
+            zerostr = "     ."
+        else:
+            entrylen = len(repr(self._max_conf))
+            entry_format = "%" + repr(entrylen) + "d"
+            zerostr = " " * (entrylen - 1) + "."
+
+        # Write the column values.
+        s = ""
+        for i in range(valuelen):
+            s += (" " * valuelen) + " |"
+            for val in value_strings:
+                if i >= valuelen - len(val):
+                    s += val[i - valuelen + len(val)].rjust(entrylen + 1)
+                else:
+                    s += " " * (entrylen + 1)
+            s += " |\n"
+
+        # Write a dividing line
+        s += "{}-+-{}+\n".format("-" * valuelen, "-" * ((entrylen + 1) * len(values)))
+
+        # Write the entries.
+        for val, li in zip(value_strings, values):
+            i = self._indices[li]
+            s += value_format % val
+            for lj in values:
+                j = self._indices[lj]
+                if confusion[i][j] == 0:
+                    s += zerostr
+                elif show_percents:
+                    s += entry_format % (100.0 * confusion[i][j] / self._total)
+                else:
+                    s += entry_format % confusion[i][j]
+                if i == j:
+                    prevspace = s.rfind(" ")
+                    s = s[:prevspace] + "<" + s[prevspace + 1 :] + ">"
+                else:
+                    s += " "
+            s += "|\n"
+
+        # Write a dividing line
+        s += "{}-+-{}+\n".format("-" * valuelen, "-" * ((entrylen + 1) * len(values)))
+
+        # Write a key
+        s += "(row = reference; col = test)\n"
+        if not values_in_chart:
+            s += "Value key:\n"
+            for i, value in enumerate(values):
+                s += "%6d: %s\n" % (i + 1, value)
+
+        return s
+
+    def key(self):
+        values = self._values
+        str = "Value key:\n"
+        indexlen = len(repr(len(values) - 1))
+        key_format = "  %" + repr(indexlen) + "d: %s\n"
+        for i in range(len(values)):
+            str += key_format % (i, values[i])
+
+        return str
+
+    def recall(self, value):
+        """Given a value in the confusion matrix, return the recall
+        that corresponds to this value. The recall is defined as:
+
+        - *r* = true positive / (true positive + false negative)
+
+        and can loosely be considered the ratio of how often ``value``
+        was predicted correctly relative to how often ``value`` was
+        the true result.
+
+        :param value: value used in the ConfusionMatrix
+        :return: the recall corresponding to ``value``.
+        :rtype: float
+        """
+        # Number of times `value` was correct, and also predicted
+        TP = self[value, value]
+        # Number of times `value` was correct
+        TP_FN = sum(self[value, pred_value] for pred_value in self._values)
+        if TP_FN == 0:
+            return 0.0
+        return TP / TP_FN
+
+    def precision(self, value):
+        """Given a value in the confusion matrix, return the precision
+        that corresponds to this value. The precision is defined as:
+
+        - *p* = true positive / (true positive + false positive)
+
+        and can loosely be considered the ratio of how often ``value``
+        was predicted correctly relative to the number of predictions
+        for ``value``.
+
+        :param value: value used in the ConfusionMatrix
+        :return: the precision corresponding to ``value``.
+        :rtype: float
+        """
+        # Number of times `value` was correct, and also predicted
+        TP = self[value, value]
+        # Number of times `value` was predicted
+        TP_FP = sum(self[real_value, value] for real_value in self._values)
+        if TP_FP == 0:
+            return 0.0
+        return TP / TP_FP
+
+    def f_measure(self, value, alpha=0.5):
+        """
+        Given a value used in the confusion matrix, return the f-measure
+        that corresponds to this value. The f-measure is the harmonic mean
+        of the ``precision`` and ``recall``, weighted by ``alpha``.
+ In particular, given the precision *p* and recall *r* defined by: + + - *p* = true positive / (true positive + false negative) + - *r* = true positive / (true positive + false positive) + + The f-measure is: + + - *1/(alpha/p + (1-alpha)/r)* + + With ``alpha = 0.5``, this reduces to: + + - *2pr / (p + r)* + + :param value: value used in the ConfusionMatrix + :param alpha: Ratio of the cost of false negative compared to false + positives. Defaults to 0.5, where the costs are equal. + :type alpha: float + :return: the F-measure corresponding to ``value``. + :rtype: float + """ + p = self.precision(value) + r = self.recall(value) + if p == 0.0 or r == 0.0: + return 0.0 + return 1.0 / (alpha / p + (1 - alpha) / r) + + def evaluate(self, alpha=0.5, truncate=None, sort_by_count=False): + """ + Tabulate the **recall**, **precision** and **f-measure** + for each value in this confusion matrix. + + >>> reference = "DET NN VB DET JJ NN NN IN DET NN".split() + >>> test = "DET VB VB DET NN NN NN IN DET NN".split() + >>> cm = ConfusionMatrix(reference, test) + >>> print(cm.evaluate()) + Tag | Prec. | Recall | F-measure + ----+--------+--------+----------- + DET | 1.0000 | 1.0000 | 1.0000 + IN | 1.0000 | 1.0000 | 1.0000 + JJ | 0.0000 | 0.0000 | 0.0000 + NN | 0.7500 | 0.7500 | 0.7500 + VB | 0.5000 | 1.0000 | 0.6667 + + + :param alpha: Ratio of the cost of false negative compared to false + positives, as used in the f-measure computation. Defaults to 0.5, + where the costs are equal. + :type alpha: float + :param truncate: If specified, then only show the specified + number of values. Any sorting (e.g., sort_by_count) + will be performed before truncation. Defaults to None + :type truncate: int, optional + :param sort_by_count: Whether to sort the outputs on frequency + in the reference label. Defaults to False. + :type sort_by_count: bool, optional + :return: A tabulated recall, precision and f-measure string + :rtype: str + """ + tags = self._values + + # Apply keyword parameters + if sort_by_count: + tags = sorted(tags, key=lambda v: -sum(self._confusion[self._indices[v]])) + if truncate: + tags = tags[:truncate] + + tag_column_len = max(max(len(tag) for tag in tags), 3) + + # Construct the header + s = ( + f"{' ' * (tag_column_len - 3)}Tag | Prec. 
| Recall | F-measure\n" + f"{'-' * tag_column_len}-+--------+--------+-----------\n" + ) + + # Construct the body + for tag in tags: + s += ( + f"{tag:>{tag_column_len}} | " + f"{self.precision(tag):<6.4f} | " + f"{self.recall(tag):<6.4f} | " + f"{self.f_measure(tag, alpha=alpha):.4f}\n" + ) + + return s + + +def demo(): + reference = "DET NN VB DET JJ NN NN IN DET NN".split() + test = "DET VB VB DET NN NN NN IN DET NN".split() + print("Reference =", reference) + print("Test =", test) + print("Confusion matrix:") + print(ConfusionMatrix(reference, test)) + print(ConfusionMatrix(reference, test).pretty_format(sort_by_count=True)) + + print(ConfusionMatrix(reference, test).recall("VB")) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/distance.py b/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/distance.py new file mode 100644 index 0000000000000000000000000000000000000000..6d187912a4a937c5036d6e06195da03f209cae9e --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/distance.py @@ -0,0 +1,508 @@ +# Natural Language Toolkit: Distance Metrics +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird +# Tom Lippincott +# URL: +# For license information, see LICENSE.TXT +# + +""" +Distance Metrics. + +Compute the distance between two items (usually strings). +As metrics, they must satisfy the following three requirements: + +1. d(a, a) = 0 +2. d(a, b) >= 0 +3. d(a, c) <= d(a, b) + d(b, c) +""" + +import operator +import warnings + + +def _edit_dist_init(len1, len2): + lev = [] + for i in range(len1): + lev.append([0] * len2) # initialize 2D array to zero + for i in range(len1): + lev[i][0] = i # column 0: 0,1,2,3,4,... + for j in range(len2): + lev[0][j] = j # row 0: 0,1,2,3,4,... + return lev + + +def _last_left_t_init(sigma): + return {c: 0 for c in sigma} + + +def _edit_dist_step( + lev, i, j, s1, s2, last_left, last_right, substitution_cost=1, transpositions=False +): + c1 = s1[i - 1] + c2 = s2[j - 1] + + # skipping a character in s1 + a = lev[i - 1][j] + 1 + # skipping a character in s2 + b = lev[i][j - 1] + 1 + # substitution + c = lev[i - 1][j - 1] + (substitution_cost if c1 != c2 else 0) + + # transposition + d = c + 1 # never picked by default + if transpositions and last_left > 0 and last_right > 0: + d = lev[last_left - 1][last_right - 1] + i - last_left + j - last_right - 1 + + # pick the cheapest + lev[i][j] = min(a, b, c, d) + + +def edit_distance(s1, s2, substitution_cost=1, transpositions=False): + """ + Calculate the Levenshtein edit-distance between two strings. + The edit distance is the number of characters that need to be + substituted, inserted, or deleted, to transform s1 into s2. For + example, transforming "rain" to "shine" requires three steps, + consisting of two substitutions and one insertion: + "rain" -> "sain" -> "shin" -> "shine". These operations could have + been done in other orders, but at least three steps are needed. + + Allows specifying the cost of substitution edits (e.g., "a" -> "b"), + because sometimes it makes sense to assign greater penalties to + substitutions. + + This also optionally allows transposition edits (e.g., "ab" -> "ba"), + though this is disabled by default. 
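+
+    For example, the two options can be compared directly (a doctest-style
+    sketch; both values follow from the definitions above):
+
+    >>> edit_distance("ab", "ba")
+    2
+    >>> edit_distance("ab", "ba", transpositions=True)
+    1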
+ + :param s1, s2: The strings to be analysed + :param transpositions: Whether to allow transposition edits + :type s1: str + :type s2: str + :type substitution_cost: int + :type transpositions: bool + :rtype: int + """ + # set up a 2-D array + len1 = len(s1) + len2 = len(s2) + lev = _edit_dist_init(len1 + 1, len2 + 1) + + # retrieve alphabet + sigma = set() + sigma.update(s1) + sigma.update(s2) + + # set up table to remember positions of last seen occurrence in s1 + last_left_t = _last_left_t_init(sigma) + + # iterate over the array + # i and j start from 1 and not 0 to stay close to the wikipedia pseudo-code + # see https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance + for i in range(1, len1 + 1): + last_right_buf = 0 + for j in range(1, len2 + 1): + last_left = last_left_t[s2[j - 1]] + last_right = last_right_buf + if s1[i - 1] == s2[j - 1]: + last_right_buf = j + _edit_dist_step( + lev, + i, + j, + s1, + s2, + last_left, + last_right, + substitution_cost=substitution_cost, + transpositions=transpositions, + ) + last_left_t[s1[i - 1]] = i + return lev[len1][len2] + + +def _edit_dist_backtrace(lev): + i, j = len(lev) - 1, len(lev[0]) - 1 + alignment = [(i, j)] + + while (i, j) != (0, 0): + directions = [ + (i - 1, j - 1), # substitution + (i - 1, j), # skip s1 + (i, j - 1), # skip s2 + ] + + direction_costs = ( + (lev[i][j] if (i >= 0 and j >= 0) else float("inf"), (i, j)) + for i, j in directions + ) + _, (i, j) = min(direction_costs, key=operator.itemgetter(0)) + + alignment.append((i, j)) + return list(reversed(alignment)) + + +def edit_distance_align(s1, s2, substitution_cost=1): + """ + Calculate the minimum Levenshtein edit-distance based alignment + mapping between two strings. The alignment finds the mapping + from string s1 to s2 that minimizes the edit distance cost. + For example, mapping "rain" to "shine" would involve 2 + substitutions, 2 matches and an insertion resulting in + the following mapping: + [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (4, 5)] + NB: (0, 0) is the start state without any letters associated + See more: https://web.stanford.edu/class/cs124/lec/med.pdf + + In case of multiple valid minimum-distance alignments, the + backtrace has the following operation precedence: + + 1. Substitute s1 and s2 characters + 2. Skip s1 character + 3. Skip s2 character + + The backtrace is carried out in reverse string order. + + This function does not support transposition. + + :param s1, s2: The strings to be aligned + :type s1: str + :type s2: str + :type substitution_cost: int + :rtype: List[Tuple(int, int)] + """ + # set up a 2-D array + len1 = len(s1) + len2 = len(s2) + lev = _edit_dist_init(len1 + 1, len2 + 1) + + # iterate over the array + for i in range(len1): + for j in range(len2): + _edit_dist_step( + lev, + i + 1, + j + 1, + s1, + s2, + 0, + 0, + substitution_cost=substitution_cost, + transpositions=False, + ) + + # backtrace to find alignment + alignment = _edit_dist_backtrace(lev) + return alignment + + +def binary_distance(label1, label2): + """Simple equality test. + + 0.0 if the labels are identical, 1.0 if they are different. 
+
+    >>> from nltk.metrics import binary_distance
+    >>> binary_distance(1,1)
+    0.0
+
+    >>> binary_distance(1,3)
+    1.0
+    """
+
+    return 0.0 if label1 == label2 else 1.0
+
+
+def jaccard_distance(label1, label2):
+    """Distance metric comparing set-similarity."""
+    return (len(label1.union(label2)) - len(label1.intersection(label2))) / len(
+        label1.union(label2)
+    )
+
+
+def masi_distance(label1, label2):
+    """Distance metric that takes into account partial agreement when multiple
+    labels are assigned.
+
+    >>> from nltk.metrics import masi_distance
+    >>> masi_distance(set([1, 2]), set([1, 2, 3, 4]))
+    0.665
+
+    Passonneau 2006, Measuring Agreement on Set-Valued Items (MASI)
+    for Semantic and Pragmatic Annotation.
+    """
+
+    len_intersection = len(label1.intersection(label2))
+    len_union = len(label1.union(label2))
+    len_label1 = len(label1)
+    len_label2 = len(label2)
+    if len_label1 == len_label2 and len_label1 == len_intersection:
+        m = 1
+    elif len_intersection == min(len_label1, len_label2):
+        m = 0.67
+    elif len_intersection > 0:
+        m = 0.33
+    else:
+        m = 0
+
+    return 1 - len_intersection / len_union * m
+
+
+def interval_distance(label1, label2):
+    """Krippendorff's interval distance metric
+
+    >>> from nltk.metrics import interval_distance
+    >>> interval_distance(1,10)
+    81
+
+    Krippendorff 1980, Content Analysis: An Introduction to its Methodology
+    """
+
+    try:
+        return pow(label1 - label2, 2)
+        # return pow(list(label1)[0]-list(label2)[0],2)
+    except TypeError:  # narrowed from a bare `except`; only non-numeric labels are expected here
+        print("non-numeric labels not supported with interval distance")
+
+
+def presence(label):
+    """Higher-order function to test presence of a given label"""
+
+    return lambda x, y: 1.0 * ((label in x) == (label in y))
+
+
+def fractional_presence(label):
+    return (
+        lambda x, y: abs((1.0 / len(x)) - (1.0 / len(y))) * (label in x and label in y)
+        or 0.0 * (label not in x and label not in y)
+        or abs(1.0 / len(x)) * (label in x and label not in y)
+        or (1.0 / len(y)) * (label not in x and label in y)
+    )
+
+
+def custom_distance(file):
+    data = {}
+    with open(file) as infile:
+        for l in infile:
+            labelA, labelB, dist = l.strip().split("\t")
+            labelA = frozenset([labelA])
+            labelB = frozenset([labelB])
+            data[frozenset([labelA, labelB])] = float(dist)
+    return lambda x, y: data[frozenset([x, y])]
+
+
+def jaro_similarity(s1, s2):
+    """
+    Computes the Jaro similarity between 2 sequences from:
+
+        Matthew A. Jaro (1989). Advances in record linkage methodology
+        as applied to the 1985 census of Tampa Florida. Journal of the
+        American Statistical Association. 84 (406): 414-20.
+
+    The Jaro similarity measures how many characters two strings have in
+    common, with an adjustment for transpositions between the matched
+    characters. The Jaro similarity formula, from
+    https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance :
+
+        ``jaro_sim = 0 if m = 0 else 1/3 * (m/|s_1| + m/|s_2| + (m-t)/m)``
+
+    where
+        - `|s_i|` is the length of string `s_i`
+        - `m` is the number of matching characters
+        - `t` is half the number of transpositions
+    """
+    # First, store the length of the strings
+    # because they will be re-used several times.
+    len_s1, len_s2 = len(s1), len(s2)
+
+    # The upper bound of the distance for being a matched character.
+    match_bound = max(len_s1, len_s2) // 2 - 1
+
+    # Initialize the counts for matches and transpositions.
+    matches = 0  # no. of matched characters in s1 and s2
+    transpositions = 0  # no. of transpositions between s1 and s2
+    flagged_1 = []  # positions in s1 which are matches to some character in s2
+    flagged_2 = []  # positions in s2 which are matches to some character in s1
+
+    # Iterate through sequences, check for matches and compute transpositions.
+    for i in range(len_s1):  # Iterate through each character.
+        upperbound = min(i + match_bound, len_s2 - 1)
+        lowerbound = max(0, i - match_bound)
+        for j in range(lowerbound, upperbound + 1):
+            if s1[i] == s2[j] and j not in flagged_2:
+                matches += 1
+                flagged_1.append(i)
+                flagged_2.append(j)
+                break
+    flagged_2.sort()
+    for i, j in zip(flagged_1, flagged_2):
+        if s1[i] != s2[j]:
+            transpositions += 1
+
+    if matches == 0:
+        return 0
+    else:
+        return (
+            1
+            / 3
+            * (
+                matches / len_s1
+                + matches / len_s2
+                + (matches - transpositions // 2) / matches
+            )
+        )
+
+
+def jaro_winkler_similarity(s1, s2, p=0.1, max_l=4):
+    """
+    The Jaro-Winkler similarity is an extension of the Jaro similarity in:
+
+        William E. Winkler. 1990. String Comparator Metrics and Enhanced
+        Decision Rules in the Fellegi-Sunter Model of Record Linkage.
+        Proceedings of the Section on Survey Research Methods.
+        American Statistical Association: 354-359.
+
+    such that:
+
+        jaro_winkler_sim = jaro_sim + ( l * p * (1 - jaro_sim) )
+
+    where,
+
+    - jaro_sim is the output from the Jaro Similarity,
+      see jaro_similarity()
+    - l is the length of the common prefix at the start of the strings;
+      this implementation caps l at an upper bound (``max_l``), for which
+      a common value is 4
+    - p is the constant scaling factor that overweighs common prefixes.
+      The Jaro-Winkler similarity will fall within the [0, 1] bound,
+      given that max(p) <= 0.25; the default is p=0.1 as in Winkler (1990)
+
+    Test using outputs from https://www.census.gov/srd/papers/pdf/rr93-8.pdf
+    from "Table 5 Comparison of String Comparators Rescaled between 0 and 1"
+
+    >>> winkler_examples = [("billy", "billy"), ("billy", "bill"), ("billy", "blily"),
+    ... ("massie", "massey"), ("yvette", "yevett"), ("billy", "bolly"), ("dwayne", "duane"),
+    ... ("dixon", "dickson"), ("billy", "susan")]
+
+    >>> winkler_scores = [1.000, 0.967, 0.947, 0.944, 0.911, 0.893, 0.858, 0.853, 0.000]
+    >>> jaro_scores = [1.000, 0.933, 0.933, 0.889, 0.889, 0.867, 0.822, 0.790, 0.000]
+
+    One way to match the values in Winkler's paper is to provide a different
+    p scaling factor for different pairs of strings, e.g.
+
+    >>> p_factors = [0.1, 0.125, 0.20, 0.125, 0.20, 0.20, 0.20, 0.15, 0.1]
+
+    >>> for (s1, s2), jscore, wscore, p in zip(winkler_examples, jaro_scores, winkler_scores, p_factors):
+    ...     assert round(jaro_similarity(s1, s2), 3) == jscore
+    ...     assert round(jaro_winkler_similarity(s1, s2, p=p), 3) == wscore
+
+    Test using outputs from https://www.census.gov/srd/papers/pdf/rr94-5.pdf from
+    "Table 2.1. Comparison of String Comparators Using Last Names, First Names, and Street Names"
+
+    >>> winkler_examples = [('SHACKLEFORD', 'SHACKELFORD'), ('DUNNINGHAM', 'CUNNIGHAM'),
+    ... ('NICHLESON', 'NICHULSON'), ('JONES', 'JOHNSON'), ('MASSEY', 'MASSIE'),
+    ... ('ABROMS', 'ABRAMS'), ('HARDIN', 'MARTINEZ'), ('ITMAN', 'SMITH'),
+    ... ('JERALDINE', 'GERALDINE'), ('MARHTA', 'MARTHA'), ('MICHELLE', 'MICHAEL'),
+    ... ('JULIES', 'JULIUS'), ('TANYA', 'TONYA'), ('DWAYNE', 'DUANE'), ('SEAN', 'SUSAN'),
+    ... ('JON', 'JOHN'), ('JON', 'JAN'), ('BROOKHAVEN', 'BRROKHAVEN'),
+    ... ('BROOK HALLOW', 'BROOK HLLW'), ('DECATUR', 'DECATIR'), ('FITZRUREITER', 'FITZENREITER'),
+    ... ('HIGBEE', 'HIGHEE'), ('HIGBEE', 'HIGVEE'), ('LACURA', 'LOCURA'), ('IOWA', 'IONA'), ('1ST', 'IST')]
+
+    >>> jaro_scores = [0.970, 0.896, 0.926, 0.790, 0.889, 0.889, 0.722, 0.467, 0.926,
+    ... 0.944, 0.869, 0.889, 0.867, 0.822, 0.783, 0.917, 0.000, 0.933, 0.944, 0.905,
+    ... 0.856, 0.889, 0.889, 0.889, 0.833, 0.000]
+
+    >>> winkler_scores = [0.982, 0.896, 0.956, 0.832, 0.944, 0.922, 0.722, 0.467, 0.926,
+    ... 0.961, 0.921, 0.933, 0.880, 0.858, 0.805, 0.933, 0.000, 0.947, 0.967, 0.943,
+    ... 0.913, 0.922, 0.922, 0.900, 0.867, 0.000]
+
+    One way to match the values in Winkler's paper is to provide a different
+    p scaling factor for different pairs of strings, e.g.
+
+    >>> p_factors = [0.1, 0.1, 0.1, 0.1, 0.125, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.20,
+    ... 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
+
+    >>> for (s1, s2), jscore, wscore, p in zip(winkler_examples, jaro_scores, winkler_scores, p_factors):
+    ...     if (s1, s2) in [('JON', 'JAN'), ('1ST', 'IST')]:
+    ...         continue  # Skip bad examples from the paper.
+    ...     assert round(jaro_similarity(s1, s2), 3) == jscore
+    ...     assert round(jaro_winkler_similarity(s1, s2, p=p), 3) == wscore
+
+    This test case shows that the output of the Jaro-Winkler similarity
+    depends on the product l * p and not on the product max_l * p. Here
+    the product max_l * p > 1, but the product l * p <= 1:
+
+    >>> round(jaro_winkler_similarity('TANYA', 'TONYA', p=0.1, max_l=100), 3)
+    0.88
+    """
+    # To ensure that the output of the Jaro-Winkler similarity falls
+    # between [0, 1], the product l * p also needs to fall between [0, 1].
+    if not 0 <= max_l * p <= 1:
+        warnings.warn(
+            "The product `max_l * p` might not fall between [0,1]. "
+            "Jaro-Winkler similarity might not be between 0 and 1."
+        )
+
+    # Compute the Jaro similarity.
+    jaro_sim = jaro_similarity(s1, s2)
+
+    # Compute the length of the common prefix, capped at max_l.
+    l = 0
+    # zip() will automatically loop until the end of the shorter string.
+    for s1_i, s2_i in zip(s1, s2):
+        if s1_i == s2_i:
+            l += 1
+        else:
+            break
+        if l == max_l:
+            break
+    # Return the similarity value as described in the docstring.
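+    # Worked example of the formula above (illustrative numbers, not from
+    # the paper): with jaro_sim = 0.9, l = 3 and p = 0.1 the prefix bonus
+    # is 3 * 0.1 * (1 - 0.9) = 0.03, giving 0.93; the bonus shrinks to
+    # zero as jaro_sim approaches 1, keeping the result within [0, 1].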
+ return jaro_sim + (l * p * (1 - jaro_sim)) + + +def demo(): + string_distance_examples = [ + ("rain", "shine"), + ("abcdef", "acbdef"), + ("language", "lnaguaeg"), + ("language", "lnaugage"), + ("language", "lngauage"), + ] + for s1, s2 in string_distance_examples: + print(f"Edit distance btwn '{s1}' and '{s2}':", edit_distance(s1, s2)) + print( + f"Edit dist with transpositions btwn '{s1}' and '{s2}':", + edit_distance(s1, s2, transpositions=True), + ) + print(f"Jaro similarity btwn '{s1}' and '{s2}':", jaro_similarity(s1, s2)) + print( + f"Jaro-Winkler similarity btwn '{s1}' and '{s2}':", + jaro_winkler_similarity(s1, s2), + ) + print( + f"Jaro-Winkler distance btwn '{s1}' and '{s2}':", + 1 - jaro_winkler_similarity(s1, s2), + ) + s1 = {1, 2, 3, 4} + s2 = {3, 4, 5} + print("s1:", s1) + print("s2:", s2) + print("Binary distance:", binary_distance(s1, s2)) + print("Jaccard distance:", jaccard_distance(s1, s2)) + print("MASI distance:", masi_distance(s1, s2)) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/paice.py b/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/paice.py new file mode 100644 index 0000000000000000000000000000000000000000..24f3c6d14c17131b657dab6814418c8726774307 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/paice.py @@ -0,0 +1,389 @@ +# Natural Language Toolkit: Agreement Metrics +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Lauri Hallila +# URL: +# For license information, see LICENSE.TXT +# + +"""Counts Paice's performance statistics for evaluating stemming algorithms. + +What is required: + - A dictionary of words grouped by their real lemmas + - A dictionary of words grouped by stems from a stemming algorithm + +When these are given, Understemming Index (UI), Overstemming Index (OI), +Stemming Weight (SW) and Error-rate relative to truncation (ERRT) are counted. + +References: +Chris D. Paice (1994). An evaluation method for stemming algorithms. +In Proceedings of SIGIR, 42--50. +""" + +from math import sqrt + + +def get_words_from_dictionary(lemmas): + """ + Get original set of words used for analysis. + + :param lemmas: A dictionary where keys are lemmas and values are sets + or lists of words corresponding to that lemma. + :type lemmas: dict(str): list(str) + :return: Set of words that exist as values in the dictionary + :rtype: set(str) + """ + words = set() + for lemma in lemmas: + words.update(set(lemmas[lemma])) + return words + + +def _truncate(words, cutlength): + """Group words by stems defined by truncating them at given length. + + :param words: Set of words used for analysis + :param cutlength: Words are stemmed by cutting at this length. + :type words: set(str) or list(str) + :type cutlength: int + :return: Dictionary where keys are stems and values are sets of words + corresponding to that stem. + :rtype: dict(str): set(str) + """ + stems = {} + for word in words: + stem = word[:cutlength] + try: + stems[stem].update([word]) + except KeyError: + stems[stem] = {word} + return stems + + +# Reference: https://en.wikipedia.org/wiki/Line-line_intersection +def _count_intersection(l1, l2): + """Count intersection between two line segments defined by coordinate pairs. 
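+
+    A small doctest-style sanity check (the two unit diagonals cross at
+    their midpoint):
+
+    >>> _count_intersection(((0, 0), (1, 1)), ((0, 1), (1, 0)))
+    (0.5, 0.5)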
+
+    :param l1: Tuple of two coordinate pairs defining the first line segment
+    :param l2: Tuple of two coordinate pairs defining the second line segment
+    :type l1: tuple(tuple(float, float), tuple(float, float))
+    :type l2: tuple(tuple(float, float), tuple(float, float))
+    :return: Coordinates of the intersection
+    :rtype: tuple(float, float)
+    """
+    x1, y1 = l1[0]
+    x2, y2 = l1[1]
+    x3, y3 = l2[0]
+    x4, y4 = l2[1]
+
+    denominator = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4)
+
+    if denominator == 0.0:  # lines are parallel
+        if x1 == x2 == x3 == x4 == 0.0:
+            # When lines are parallel, they must be on the y-axis.
+            # We can ignore x-axis because we stop counting the
+            # truncation line when we get there.
+            # There are no other options as UI (x-axis) grows and
+            # OI (y-axis) diminishes when we go along the truncation line.
+            return (0.0, y4)
+
+    x = (
+        (x1 * y2 - y1 * x2) * (x3 - x4) - (x1 - x2) * (x3 * y4 - y3 * x4)
+    ) / denominator
+    y = (
+        (x1 * y2 - y1 * x2) * (y3 - y4) - (y1 - y2) * (x3 * y4 - y3 * x4)
+    ) / denominator
+    return (x, y)
+
+
+def _get_derivative(coordinates):
+    """Get the derivative of the line from (0, 0) to the given coordinates.
+
+    :param coordinates: A coordinate pair
+    :type coordinates: tuple(float, float)
+    :return: Derivative; inf if x is zero
+    :rtype: float
+    """
+    try:
+        return coordinates[1] / coordinates[0]
+    except ZeroDivisionError:
+        return float("inf")
+
+
+def _calculate_cut(lemmawords, stems):
+    """Count understemmed and overstemmed pairs for a (lemma, stem) pair with common words.
+
+    :param lemmawords: Set or list of words corresponding to a certain lemma.
+    :param stems: A dictionary where keys are stems and values are sets
+        or lists of words corresponding to that stem.
+    :type lemmawords: set(str) or list(str)
+    :type stems: dict(str): set(str)
+    :return: Number of understemmed and overstemmed pairs contributed by words
+        existing in both lemmawords and stems.
+    :rtype: tuple(float, float)
+    """
+    umt, wmt = 0.0, 0.0
+    for stem in stems:
+        cut = set(lemmawords) & set(stems[stem])
+        if cut:
+            cutcount = len(cut)
+            stemcount = len(stems[stem])
+            # Unachieved merge total
+            umt += cutcount * (len(lemmawords) - cutcount)
+            # Wrongly merged total
+            wmt += cutcount * (stemcount - cutcount)
+    return (umt, wmt)
+
+
+def _calculate(lemmas, stems):
+    """Calculate actual and maximum possible amounts of understemmed and overstemmed word pairs.
+
+    :param lemmas: A dictionary where keys are lemmas and values are sets
+        or lists of words corresponding to that lemma.
+    :param stems: A dictionary where keys are stems and values are sets
+        or lists of words corresponding to that stem.
+    :type lemmas: dict(str): list(str)
+    :type stems: dict(str): set(str)
+    :return: Global unachieved merge total (gumt),
+        global desired merge total (gdmt),
+        global wrongly merged total (gwmt) and
+        global desired non-merge total (gdnt).
+    :rtype: tuple(float, float, float, float)
+    """
+
+    n = sum(len(lemmas[word]) for word in lemmas)
+
+    gdmt, gdnt, gumt, gwmt = (0.0, 0.0, 0.0, 0.0)
+
+    for lemma in lemmas:
+        lemmacount = len(lemmas[lemma])
+
+        # Desired merge total
+        gdmt += lemmacount * (lemmacount - 1)
+
+        # Desired non-merge total
+        gdnt += lemmacount * (n - lemmacount)
+
+        # For each (lemma, stem) pair with common words, count how many
+        # pairs are understemmed and overstemmed.
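+        # Illustrative sketch: with the demo() data below, the lemma "ring"
+        # -> {"ring", "rang", "rung"} is split across three stems, so each
+        # one-word overlap adds 1 * (3 - 1) = 2 understemmed pairs to umt.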
+        umt, wmt = _calculate_cut(lemmas[lemma], stems)
+
+        # Add to the global unachieved and wrongly-merged totals
+        gumt += umt
+        gwmt += wmt
+
+    # Each object is counted twice, so divide by two
+    return (gumt / 2, gdmt / 2, gwmt / 2, gdnt / 2)
+
+
+def _indexes(gumt, gdmt, gwmt, gdnt):
+    """Count Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW).
+
+    :param gumt, gdmt, gwmt, gdnt: Global unachieved merge total (gumt),
+        global desired merge total (gdmt),
+        global wrongly merged total (gwmt) and
+        global desired non-merge total (gdnt).
+    :type gumt, gdmt, gwmt, gdnt: float
+    :return: Understemming Index (UI),
+        Overstemming Index (OI) and
+        Stemming Weight (SW).
+    :rtype: tuple(float, float, float)
+    """
+    # Calculate Understemming Index (UI),
+    # Overstemming Index (OI) and Stemming Weight (SW)
+    try:
+        ui = gumt / gdmt
+    except ZeroDivisionError:
+        # If GDMT (max merge total) is 0, define UI as 0
+        ui = 0.0
+    try:
+        oi = gwmt / gdnt
+    except ZeroDivisionError:
+        # If GDNT (max non-merge total) is 0, define OI as 0
+        oi = 0.0
+    try:
+        sw = oi / ui
+    except ZeroDivisionError:
+        if oi == 0.0:
+            # OI and UI are 0, define SW as 'not a number'
+            sw = float("nan")
+        else:
+            # UI is 0, define SW as infinity
+            sw = float("inf")
+    return (ui, oi, sw)
+
+
+class Paice:
+    """Class for storing lemmas, stems and evaluation metrics."""
+
+    def __init__(self, lemmas, stems):
+        """
+        :param lemmas: A dictionary where keys are lemmas and values are sets
+            or lists of words corresponding to that lemma.
+        :param stems: A dictionary where keys are stems and values are sets
+            or lists of words corresponding to that stem.
+        :type lemmas: dict(str): list(str)
+        :type stems: dict(str): set(str)
+        """
+        self.lemmas = lemmas
+        self.stems = stems
+        self.coords = []
+        self.gumt, self.gdmt, self.gwmt, self.gdnt = (None, None, None, None)
+        self.ui, self.oi, self.sw = (None, None, None)
+        self.errt = None
+        self.update()
+
+    def __str__(self):
+        text = ["Global Unachieved Merge Total (GUMT): %s\n" % self.gumt]
+        text.append("Global Desired Merge Total (GDMT): %s\n" % self.gdmt)
+        text.append("Global Wrongly-Merged Total (GWMT): %s\n" % self.gwmt)
+        text.append("Global Desired Non-merge Total (GDNT): %s\n" % self.gdnt)
+        text.append("Understemming Index (GUMT / GDMT): %s\n" % self.ui)
+        text.append("Overstemming Index (GWMT / GDNT): %s\n" % self.oi)
+        text.append("Stemming Weight (OI / UI): %s\n" % self.sw)
+        text.append("Error-Rate Relative to Truncation (ERRT): %s\r\n" % self.errt)
+        coordinates = " ".join(["(%s, %s)" % item for item in self.coords])
+        text.append("Truncation line: %s" % coordinates)
+        return "".join(text)
+
+    def _get_truncation_indexes(self, words, cutlength):
+        """Count (UI, OI) when stemming is done by truncating words at 'cutlength'.
+
+        :param words: Words used for the analysis
+        :param cutlength: Words are stemmed by cutting them at this length
+        :type words: set(str) or list(str)
+        :type cutlength: int
+        :return: Understemming and overstemming indexes
+        :rtype: tuple(float, float)
+        """
+
+        truncated = _truncate(words, cutlength)
+        gumt, gdmt, gwmt, gdnt = _calculate(self.lemmas, truncated)
+        ui, oi = _indexes(gumt, gdmt, gwmt, gdnt)[:2]
+        return (ui, oi)
+
+    def _get_truncation_coordinates(self, cutlength=0):
+        """Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line.
+
+        :param cutlength: Optional parameter to start counting from the (ui, oi)
+            coordinates gotten by stemming at this length. Useful for speeding up
+            the calculations when you know the approximate location of the
+            intersection.
+        :type cutlength: int
+        :return: List of coordinate pairs that define the truncation line
+        :rtype: list(tuple(float, float))
+        """
+        words = get_words_from_dictionary(self.lemmas)
+        maxlength = max(len(word) for word in words)
+
+        # Truncate words from different points until the (0, 0) - (ui, oi)
+        # segment crosses the truncation line
+        coords = []
+        while cutlength <= maxlength:
+            # Get the (UI, OI) pair of the current truncation point
+            pair = self._get_truncation_indexes(words, cutlength)
+
+            # Store only new coordinates so we'll have an actual
+            # line segment when counting the intersection point
+            if pair not in coords:
+                coords.append(pair)
+            if pair == (0.0, 0.0):
+                # Stop counting if the truncation line goes through the origin;
+                # the length from the origin to the truncation line is 0
+                return coords
+            if len(coords) >= 2 and pair[0] > 0.0:
+                derivative1 = _get_derivative(coords[-2])
+                derivative2 = _get_derivative(coords[-1])
+                # The derivative of the truncation line is a decreasing value;
+                # when it passes the Stemming Weight, we've found the segment
+                # of the truncation line intersecting the (0, 0) - (ui, oi) segment
+                if derivative1 >= self.sw >= derivative2:
+                    return coords
+            cutlength += 1
+        return coords
+
+    def _errt(self):
+        """Count Error-Rate Relative to Truncation (ERRT).
+
+        :return: ERRT, the length of the line from the origin to (UI, OI)
+            divided by the length of the line from the origin to the point
+            defined by the same line when extended until the truncation line.
+        :rtype: float
+        """
+        # Count (UI, OI) pairs for truncation points until we find the segment
+        # where (ui, oi) crosses the truncation line
+        self.coords = self._get_truncation_coordinates()
+        if (0.0, 0.0) in self.coords:
+            # The truncation line goes through the origin, so ERRT cannot be counted
+            if (self.ui, self.oi) != (0.0, 0.0):
+                return float("inf")
+            else:
+                return float("nan")
+        if (self.ui, self.oi) == (0.0, 0.0):
+            # (ui, oi) is the origin; define ERRT as 0.0
+            return 0.0
+        # Count the intersection point
+        # Note that (self.ui, self.oi) cannot be (0.0, 0.0) and self.coords
+        # has different coordinates, so we have actual line segments instead
+        # of a line segment and a point
+        intersection = _count_intersection(
+            ((0, 0), (self.ui, self.oi)), self.coords[-2:]
+        )
+        # Count OP (the length of the line from the origin to (ui, oi))
+        op = sqrt(self.ui**2 + self.oi**2)
+        # Count OT (the length of the line from the origin to the truncation
+        # line that goes through (ui, oi))
+        ot = sqrt(intersection[0] ** 2 + intersection[1] ** 2)
+        # OP / OT tells how well the stemming algorithm works compared to
+        # just truncating words
+        return op / ot
+
+    def update(self):
+        """Update statistics after lemmas and stems have been set."""
+        self.gumt, self.gdmt, self.gwmt, self.gdnt = _calculate(self.lemmas, self.stems)
+        self.ui, self.oi, self.sw = _indexes(self.gumt, self.gdmt, self.gwmt, self.gdnt)
+        self.errt = self._errt()
+
+
+def demo():
+    """Demonstration of the module."""
+    # Some words with their real lemmas
+    lemmas = {
+        "kneel": ["kneel", "knelt"],
+        "range": ["range", "ranged"],
+        "ring": ["ring", "rang", "rung"],
+    }
+    # The same words with stems from a stemming algorithm
+    stems = {
+        "kneel": ["kneel"],
+        "knelt": ["knelt"],
+        "rang": ["rang", "range", "ranged"],
+        "ring": ["ring"],
+        "rung": ["rung"],
+    }
+    print("Words grouped by their lemmas:")
+    for lemma in sorted(lemmas):
+        print("{} => {}".format(lemma, " ".join(lemmas[lemma])))
+    print()
+    print("Same words grouped by a stemming algorithm:")
for stem in sorted(stems): + print("{} => {}".format(stem, " ".join(stems[stem]))) + print() + p = Paice(lemmas, stems) + print(p) + print() + # Let's "change" results from a stemming algorithm + stems = { + "kneel": ["kneel"], + "knelt": ["knelt"], + "rang": ["rang"], + "range": ["range", "ranged"], + "ring": ["ring"], + "rung": ["rung"], + } + print("Counting stats after changing stemming results:") + for stem in sorted(stems): + print("{} => {}".format(stem, " ".join(stems[stem]))) + print() + p.stems = stems + p.update() + print(p) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/scores.py b/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/scores.py new file mode 100644 index 0000000000000000000000000000000000000000..6547519df099ebf530c7077dc646d1112104d67f --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/metrics/scores.py @@ -0,0 +1,228 @@ +# Natural Language Toolkit: Evaluation +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Edward Loper +# Steven Bird +# URL: +# For license information, see LICENSE.TXT + +import operator +from functools import reduce +from math import fabs +from random import shuffle + +try: + from scipy.stats.stats import betai +except ImportError: + betai = None + +from nltk.util import LazyConcatenation, LazyMap + + +def accuracy(reference, test): + """ + Given a list of reference values and a corresponding list of test + values, return the fraction of corresponding values that are + equal. In particular, return the fraction of indices + ``0= actual_stat: + c += 1 + + if verbose and i % 10 == 0: + print("pseudo-statistic: %f" % pseudo_stat) + print("significance: %f" % ((c + 1) / (i + 1))) + print("-" * 60) + + significance = (c + 1) / (shuffles + 1) + + if verbose: + print("significance: %f" % significance) + if betai: + for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]: + print(f"prob(phi<={phi:f}): {betai(c, shuffles, phi):f}") + + return (significance, c, shuffles) + + +def demo(): + print("-" * 75) + reference = "DET NN VB DET JJ NN NN IN DET NN".split() + test = "DET VB VB DET NN NN NN IN DET NN".split() + print("Reference =", reference) + print("Test =", test) + print("Accuracy:", accuracy(reference, test)) + + print("-" * 75) + reference_set = set(reference) + test_set = set(test) + print("Reference =", reference_set) + print("Test = ", test_set) + print("Precision:", precision(reference_set, test_set)) + print(" Recall:", recall(reference_set, test_set)) + print("F-Measure:", f_measure(reference_set, test_set)) + print("-" * 75) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/misc/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/misc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..49f7772dd2ef15cc3ec40c4aa92c02695b33b6a3 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/misc/__init__.py @@ -0,0 +1,11 @@ +# Natural Language Toolkit: Miscellaneous modules +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# URL: +# For license information, see LICENSE.TXT + +from nltk.misc.babelfish import babelize_shell +from nltk.misc.chomsky import generate_chomsky +from nltk.misc.minimalset import MinimalSet +from nltk.misc.wordfinder import word_finder diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/misc/babelfish.py b/.eggs/nltk-3.8-py3.10.egg/nltk/misc/babelfish.py new file mode 100644 index 0000000000000000000000000000000000000000..d317d65a194578e28ffad94bd53803395b5e3c58 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/misc/babelfish.py @@ -0,0 
+1,10 @@ +""" +This module previously provided an interface to Babelfish online +translation service; this service is no longer available; this +module is kept in NLTK source code in order to provide better error +messages for people following the NLTK Book 2.0. +""" + + +def babelize_shell(): + print("Babelfish online translation service is no longer available.") diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/misc/chomsky.py b/.eggs/nltk-3.8-py3.10.egg/nltk/misc/chomsky.py new file mode 100644 index 0000000000000000000000000000000000000000..0632bca034512041b3e0cf9a6231f8ac1c131e4b --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/misc/chomsky.py @@ -0,0 +1,134 @@ +# Chomsky random text generator, version 1.1, Raymond Hettinger, 2005/09/13 +# https://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/440546 + +""" +CHOMSKY is an aid to writing linguistic papers in the style +of the great master. It is based on selected phrases taken +from actual books and articles written by Noam Chomsky. +Upon request, it assembles the phrases in the elegant +stylistic patterns that Chomsky is noted for. +To generate n sentences of linguistic wisdom, type + + (CHOMSKY n) -- for example + (CHOMSKY 5) generates half a screen of linguistic truth. +""" + +leadins = """To characterize a linguistic level L, + On the other hand, + This suggests that + It appears that + Furthermore, + We will bring evidence in favor of the following thesis: + To provide a constituent structure for T(Z,K), + From C1, it follows that + For any transformation which is sufficiently diversified in \ +application to be of any interest, + Analogously, + Clearly, + Note that + Of course, + Suppose, for instance, that + Thus + With this clarification, + Conversely, + We have already seen that + By combining adjunctions and certain deformations, + I suggested that these results would follow from the assumption that + If the position of the trace in (99c) were only relatively \ +inaccessible to movement, + However, this assumption is not correct, since + Comparing these examples with their parasitic gap counterparts in \ +(96) and (97), we see that + In the discussion of resumptive pronouns following (81), + So far, + Nevertheless, + For one thing, + Summarizing, then, we assume that + A consequence of the approach just outlined is that + Presumably, + On our assumptions, + It may be, then, that + It must be emphasized, once again, that + Let us continue to suppose that + Notice, incidentally, that """ +# List of LEADINs to buy time. + +subjects = """ the notion of level of grammaticalness + a case of semigrammaticalness of a different sort + most of the methodological work in modern linguistics + a subset of English sentences interesting on quite independent grounds + the natural general principle that will subsume this case + an important property of these three types of EC + any associated supporting element + the appearance of parasitic gaps in domains relatively inaccessible \ +to ordinary extraction + the speaker-hearer's linguistic intuition + the descriptive power of the base component + the earlier discussion of deviance + this analysis of a formative as a pair of sets of features + this selectionally introduced contextual feature + a descriptively adequate grammar + the fundamental error of regarding functional notions as categorial + relational information + the systematic use of complex symbols + the theory of syntactic features developed earlier""" +# List of SUBJECTs chosen for maximum professorial macho. 
+ +verbs = """can be defined in such a way as to impose + delimits + suffices to account for + cannot be arbitrary in + is not subject to + does not readily tolerate + raises serious doubts about + is not quite equivalent to + does not affect the structure of + may remedy and, at the same time, eliminate + is not to be considered in determining + is to be regarded as + is unspecified with respect to + is, apparently, determined by + is necessary to impose an interpretation on + appears to correlate rather closely with + is rather different from""" +# List of VERBs chosen for autorecursive obfuscation. + +objects = """ problems of phonemic and morphological analysis. + a corpus of utterance tokens upon which conformity has been defined \ +by the paired utterance test. + the traditional practice of grammarians. + the levels of acceptability from fairly high (e.g. (99a)) to virtual \ +gibberish (e.g. (98d)). + a stipulation to place the constructions into these various categories. + a descriptive fact. + a parasitic gap construction. + the extended c-command discussed in connection with (34). + the ultimate standard that determines the accuracy of any proposed grammar. + the system of base rules exclusive of the lexicon. + irrelevant intervening contexts in selectional rules. + nondistinctness in the sense of distinctive feature theory. + a general convention regarding the forms of the grammar. + an abstract underlying order. + an important distinction in language use. + the requirement that branching is not tolerated within the dominance \ +scope of a complex symbol. + the strong generative capacity of the theory.""" +# List of OBJECTs selected for profound sententiousness. + +import random +import textwrap +from itertools import chain, islice + + +def generate_chomsky(times=5, line_length=72): + parts = [] + for part in (leadins, subjects, verbs, objects): + phraselist = list(map(str.strip, part.splitlines())) + random.shuffle(phraselist) + parts.append(phraselist) + output = chain.from_iterable(islice(zip(*parts), 0, times)) + print(textwrap.fill(" ".join(output), line_length)) + + +if __name__ == "__main__": + generate_chomsky() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/misc/minimalset.py b/.eggs/nltk-3.8-py3.10.egg/nltk/misc/minimalset.py new file mode 100644 index 0000000000000000000000000000000000000000..2fa93627401d689e8d942dd268ad14a52e8c6d25 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/misc/minimalset.py @@ -0,0 +1,85 @@ +# Natural Language Toolkit: Minimal Sets +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# URL: +# For license information, see LICENSE.TXT + +from collections import defaultdict + + +class MinimalSet: + """ + Find contexts where more than one possible target value can + appear. E.g. if targets are word-initial letters, and contexts + are the remainders of words, then we would like to find cases like + "fat" vs "cat", and "training" vs "draining". If targets are + parts-of-speech and contexts are words, then we would like to find + cases like wind (noun) 'air in rapid motion', vs wind (verb) + 'coil, wrap'. + """ + + def __init__(self, parameters=None): + """ + Create a new minimal set. 
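+
+        A doctest-style sketch with hypothetical data (targets are initial
+        letters, contexts the word remainders):
+
+        >>> ms = MinimalSet([("_at", "f", "fat"), ("_at", "c", "cat")])
+        >>> ms.contexts()
+        ['_at']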
+
+        :param parameters: The (context, target, display) tuples for the item
+        :type parameters: list(tuple(str, str, str))
+        """
+        self._targets = set()  # the contrastive information
+        self._contexts = set()  # what we are controlling for
+        self._seen = defaultdict(set)  # to record what we have seen
+        self._displays = {}  # what we will display
+
+        if parameters:
+            for context, target, display in parameters:
+                self.add(context, target, display)
+
+    def add(self, context, target, display):
+        """
+        Add a new item to the minimal set, having the specified
+        context, target, and display form.
+
+        :param context: The context in which the item of interest appears
+        :type context: str
+        :param target: The item of interest
+        :type target: str
+        :param display: The information to be reported for each item
+        :type display: str
+        """
+        # Store the set of targets that occurred in this context
+        self._seen[context].add(target)
+
+        # Keep track of which contexts and targets we have seen
+        self._contexts.add(context)
+        self._targets.add(target)
+
+        # For a given context and target, store the display form
+        self._displays[(context, target)] = display
+
+    def contexts(self, minimum=2):
+        """
+        Determine which contexts occurred with enough distinct targets.
+
+        :param minimum: the minimum number of distinct target forms
+        :type minimum: int
+        :rtype: list
+        """
+        return [c for c in self._contexts if len(self._seen[c]) >= minimum]
+
+    def display(self, context, target, default=""):
+        if (context, target) in self._displays:
+            return self._displays[(context, target)]
+        else:
+            return default
+
+    def display_all(self, context):
+        result = []
+        for target in self._targets:
+            x = self.display(context, target)
+            if x:
+                result.append(x)
+        return result
+
+    def targets(self):
+        return self._targets
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/misc/sort.py b/.eggs/nltk-3.8-py3.10.egg/nltk/misc/sort.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9387bf24dd4ff744602578fdd692ae60eedfaac
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/misc/sort.py
@@ -0,0 +1,176 @@
+# Natural Language Toolkit: List Sorting
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Steven Bird
+# URL:
+# For license information, see LICENSE.TXT
+
+"""
+This module provides a variety of list sorting algorithms, to
+illustrate the many different algorithms (recipes) for solving a
+problem, and how to analyze algorithms experimentally.
+"""
+# These algorithms are taken from:
+# Levitin (2004) The Design and Analysis of Algorithms
+
+##################################################################
+# Selection Sort
+##################################################################
+
+
+def selection(a):
+    """
+    Selection Sort: scan the list to find its smallest element, then
+    swap it with the first element.  The remainder of the list is one
+    element smaller; apply the same method to this list, and so on.
+    """
+    count = 0
+
+    for i in range(len(a) - 1):
+        min_idx = i  # renamed from `min` to avoid shadowing the builtin
+
+        for j in range(i + 1, len(a)):
+            if a[j] < a[min_idx]:
+                min_idx = j
+
+            count += 1
+
+        a[min_idx], a[i] = a[i], a[min_idx]
+
+    return count
+
+
+##################################################################
+# Bubble Sort
+##################################################################
+
+
+def bubble(a):
+    """
+    Bubble Sort: compare adjacent elements of the list left-to-right,
+    and swap them if they are out of order.  After one pass through
+    the list swapping adjacent items, the largest item will be in
+    the rightmost position.
The remainder is one element smaller; + apply the same method to this list, and so on. + """ + count = 0 + for i in range(len(a) - 1): + for j in range(len(a) - i - 1): + if a[j + 1] < a[j]: + a[j], a[j + 1] = a[j + 1], a[j] + count += 1 + return count + + +################################################################## +# Merge Sort +################################################################## + + +def _merge_lists(b, c): + count = 0 + i = j = 0 + a = [] + while i < len(b) and j < len(c): + count += 1 + if b[i] <= c[j]: + a.append(b[i]) + i += 1 + else: + a.append(c[j]) + j += 1 + if i == len(b): + a += c[j:] + else: + a += b[i:] + return a, count + + +def merge(a): + """ + Merge Sort: split the list in half, and sort each half, then + combine the sorted halves. + """ + count = 0 + if len(a) > 1: + midpoint = len(a) // 2 + b = a[:midpoint] + c = a[midpoint:] + count_b = merge(b) + count_c = merge(c) + result, count_a = _merge_lists(b, c) + a[:] = result # copy the result back into a. + count = count_a + count_b + count_c + return count + + +################################################################## +# Quick Sort +################################################################## + + +def _partition(a, l, r): + p = a[l] + i = l + j = r + 1 + count = 0 + while True: + while i < r: + i += 1 + if a[i] >= p: + break + while j > l: + j -= 1 + if j < l or a[j] <= p: + break + a[i], a[j] = a[j], a[i] # swap + count += 1 + if i >= j: + break + a[i], a[j] = a[j], a[i] # undo last swap + a[l], a[j] = a[j], a[l] + return j, count + + +def _quick(a, l, r): + count = 0 + if l < r: + s, count = _partition(a, l, r) + count += _quick(a, l, s - 1) + count += _quick(a, s + 1, r) + return count + + +def quick(a): + return _quick(a, 0, len(a) - 1) + + +################################################################## +# Demonstration +################################################################## + + +def demo(): + from random import shuffle + + for size in (10, 20, 50, 100, 200, 500, 1000): + a = list(range(size)) + + # various sort methods + shuffle(a) + count_selection = selection(a) + shuffle(a) + count_bubble = bubble(a) + shuffle(a) + count_merge = merge(a) + shuffle(a) + count_quick = quick(a) + + print( + ("size=%5d: selection=%8d, bubble=%8d, " "merge=%6d, quick=%6d") + % (size, count_selection, count_bubble, count_merge, count_quick) + ) + + +if __name__ == "__main__": + demo() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/misc/wordfinder.py b/.eggs/nltk-3.8-py3.10.egg/nltk/misc/wordfinder.py new file mode 100644 index 0000000000000000000000000000000000000000..eb3824c591022f216680760ba3c794b90f92afcb --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/misc/wordfinder.py @@ -0,0 +1,139 @@ +# Natural Language Toolkit: Word Finder +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# URL: +# For license information, see LICENSE.TXT + +# Simplified from PHP version by Robert Klein +# http://fswordfinder.sourceforge.net/ + +import random + + +# reverse a word with probability 0.5 +def revword(word): + if random.randint(1, 2) == 1: + return word[::-1] + return word + + +# try to insert word at position x,y; direction encoded in xf,yf +def step(word, x, xf, y, yf, grid): + for i in range(len(word)): + if grid[xf(i)][yf(i)] != "" and grid[xf(i)][yf(i)] != word[i]: + return False + for i in range(len(word)): + grid[xf(i)][yf(i)] = word[i] + return True + + +# try to insert word at position x,y, in direction dir +def check(word, dir, x, y, grid, rows, cols): + if dir == 1: + 
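+        # (Assumed reading of the encoding: dir 1 writes up-left, dir 2 up,
+        # dir 3 up-right, dir 4 left, where x indexes rows and y columns.)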
if x - len(word) < 0 or y - len(word) < 0: + return False + return step(word, x, lambda i: x - i, y, lambda i: y - i, grid) + elif dir == 2: + if x - len(word) < 0: + return False + return step(word, x, lambda i: x - i, y, lambda i: y, grid) + elif dir == 3: + if x - len(word) < 0 or y + (len(word) - 1) >= cols: + return False + return step(word, x, lambda i: x - i, y, lambda i: y + i, grid) + elif dir == 4: + if y - len(word) < 0: + return False + return step(word, x, lambda i: x, y, lambda i: y - i, grid) + + +def wordfinder(words, rows=20, cols=20, attempts=50, alph="ABCDEFGHIJKLMNOPQRSTUVWXYZ"): + """ + Attempt to arrange words into a letter-grid with the specified + number of rows and columns. Try each word in several positions + and directions, until it can be fitted into the grid, or the + maximum number of allowable attempts is exceeded. Returns a tuple + consisting of the grid and the words that were successfully + placed. + + :param words: the list of words to be put into the grid + :type words: list + :param rows: the number of rows in the grid + :type rows: int + :param cols: the number of columns in the grid + :type cols: int + :param attempts: the number of times to attempt placing a word + :type attempts: int + :param alph: the alphabet, to be used for filling blank cells + :type alph: list + :rtype: tuple + """ + + # place longer words first + words = sorted(words, key=len, reverse=True) + + grid = [] # the letter grid + used = [] # the words we used + + # initialize the grid + for i in range(rows): + grid.append([""] * cols) + + # try to place each word + for word in words: + word = word.strip().upper() # normalize + save = word # keep a record of the word + word = revword(word) + for attempt in range(attempts): + r = random.randint(0, len(word)) + dir = random.choice([1, 2, 3, 4]) + x = random.randint(0, rows) + y = random.randint(0, cols) + if dir == 1: + x += r + y += r + elif dir == 2: + x += r + elif dir == 3: + x += r + y -= r + elif dir == 4: + y += r + if 0 <= x < rows and 0 <= y < cols: + if check(word, dir, x, y, grid, rows, cols): + # used.append((save, dir, x, y, word)) + used.append(save) + break + + # Fill up the remaining spaces + for i in range(rows): + for j in range(cols): + if grid[i][j] == "": + grid[i][j] = random.choice(alph) + + return grid, used + + +def word_finder(): + from nltk.corpus import words + + wordlist = words.words() + random.shuffle(wordlist) + wordlist = wordlist[:200] + wordlist = [w for w in wordlist if 3 <= len(w) <= 12] + grid, used = wordfinder(wordlist) + + print("Word Finder\n") + for i in range(len(grid)): + for j in range(len(grid[i])): + print(grid[i][j], end=" ") + print() + print() + + for i in range(len(used)): + print("%d:" % (i + 1), used[i]) + + +if __name__ == "__main__": + word_finder() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/parse/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..706713f852d937e8a0b7475f8c47f24bb54f872a --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/__init__.py @@ -0,0 +1,102 @@ +# Natural Language Toolkit: Parsers +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# Edward Loper +# URL: +# For license information, see LICENSE.TXT +# + +""" +NLTK Parsers + +Classes and interfaces for producing tree structures that represent +the internal organization of a text. This task is known as "parsing" +the text, and the resulting tree structures are called the text's +"parses". 
Typically, the text is a single sentence, and the tree +structure represents the syntactic structure of the sentence. +However, parsers can also be used in other domains. For example, +parsers can be used to derive the morphological structure of the +morphemes that make up a word, or to derive the discourse structure +for a set of utterances. + +Sometimes, a single piece of text can be represented by more than one +tree structure. Texts represented by more than one tree structure are +called "ambiguous" texts. Note that there are actually two ways in +which a text can be ambiguous: + + - The text has multiple correct parses. + - There is not enough information to decide which of several + candidate parses is correct. + +However, the parser module does *not* distinguish these two types of +ambiguity. + +The parser module defines ``ParserI``, a standard interface for parsing +texts; and two simple implementations of that interface, +``ShiftReduceParser`` and ``RecursiveDescentParser``. It also contains +three sub-modules for specialized kinds of parsing: + + - ``nltk.parser.chart`` defines chart parsing, which uses dynamic + programming to efficiently parse texts. + - ``nltk.parser.probabilistic`` defines probabilistic parsing, which + associates a probability with each parse. +""" + +from nltk.parse.api import ParserI +from nltk.parse.bllip import BllipParser +from nltk.parse.chart import ( + BottomUpChartParser, + BottomUpLeftCornerChartParser, + ChartParser, + LeftCornerChartParser, + SteppingChartParser, + TopDownChartParser, +) +from nltk.parse.corenlp import CoreNLPDependencyParser, CoreNLPParser +from nltk.parse.dependencygraph import DependencyGraph +from nltk.parse.earleychart import ( + EarleyChartParser, + FeatureEarleyChartParser, + FeatureIncrementalBottomUpChartParser, + FeatureIncrementalBottomUpLeftCornerChartParser, + FeatureIncrementalChartParser, + FeatureIncrementalTopDownChartParser, + IncrementalBottomUpChartParser, + IncrementalBottomUpLeftCornerChartParser, + IncrementalChartParser, + IncrementalLeftCornerChartParser, + IncrementalTopDownChartParser, +) +from nltk.parse.evaluate import DependencyEvaluator +from nltk.parse.featurechart import ( + FeatureBottomUpChartParser, + FeatureBottomUpLeftCornerChartParser, + FeatureChartParser, + FeatureTopDownChartParser, +) +from nltk.parse.malt import MaltParser +from nltk.parse.nonprojectivedependencyparser import ( + NaiveBayesDependencyScorer, + NonprojectiveDependencyParser, + ProbabilisticNonprojectiveParser, +) +from nltk.parse.pchart import ( + BottomUpProbabilisticChartParser, + InsideChartParser, + LongestChartParser, + RandomChartParser, + UnsortedChartParser, +) +from nltk.parse.projectivedependencyparser import ( + ProbabilisticProjectiveDependencyParser, + ProjectiveDependencyParser, +) +from nltk.parse.recursivedescent import ( + RecursiveDescentParser, + SteppingRecursiveDescentParser, +) +from nltk.parse.shiftreduce import ShiftReduceParser, SteppingShiftReduceParser +from nltk.parse.transitionparser import TransitionParser +from nltk.parse.util import TestGrammar, extract_test_sentences, load_parser +from nltk.parse.viterbi import ViterbiParser diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/parse/api.py b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/api.py new file mode 100644 index 0000000000000000000000000000000000000000..bdf326420bc32665c794a169d6deed52bdd3eb2c --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/api.py @@ -0,0 +1,72 @@ +# Natural Language Toolkit: Parser API +# +# Copyright (C) 2001-2022 NLTK 
Project +# Author: Steven Bird +# Edward Loper +# URL: +# For license information, see LICENSE.TXT +# + +import itertools + +from nltk.internals import overridden + + +class ParserI: + """ + A processing class for deriving trees that represent possible + structures for a sequence of tokens. These tree structures are + known as "parses". Typically, parsers are used to derive syntax + trees for sentences. But parsers can also be used to derive other + kinds of tree structure, such as morphological trees and discourse + structures. + + Subclasses must define: + - at least one of: ``parse()``, ``parse_sents()``. + + Subclasses may define: + - ``grammar()`` + """ + + def grammar(self): + """ + :return: The grammar used by this parser. + """ + raise NotImplementedError() + + def parse(self, sent, *args, **kwargs): + """ + :return: An iterator that generates parse trees for the sentence. + When possible this list is sorted from most likely to least likely. + + :param sent: The sentence to be parsed + :type sent: list(str) + :rtype: iter(Tree) + """ + if overridden(self.parse_sents): + return next(self.parse_sents([sent], *args, **kwargs)) + elif overridden(self.parse_one): + return ( + tree + for tree in [self.parse_one(sent, *args, **kwargs)] + if tree is not None + ) + elif overridden(self.parse_all): + return iter(self.parse_all(sent, *args, **kwargs)) + else: + raise NotImplementedError() + + def parse_sents(self, sents, *args, **kwargs): + """ + Apply ``self.parse()`` to each element of ``sents``. + :rtype: iter(iter(Tree)) + """ + return (self.parse(sent, *args, **kwargs) for sent in sents) + + def parse_all(self, sent, *args, **kwargs): + """:rtype: list(Tree)""" + return list(self.parse(sent, *args, **kwargs)) + + def parse_one(self, sent, *args, **kwargs): + """:rtype: Tree or None""" + return next(self.parse(sent, *args, **kwargs), None) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/parse/bllip.py b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/bllip.py new file mode 100644 index 0000000000000000000000000000000000000000..5e479e94f7929a836dbd990106b69055dcd320f6 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/bllip.py @@ -0,0 +1,299 @@ +# Natural Language Toolkit: Interface to BLLIP Parser +# +# Author: David McClosky +# +# Copyright (C) 2001-2022 NLTK Project +# URL: +# For license information, see LICENSE.TXT + +from nltk.parse.api import ParserI +from nltk.tree import Tree + +""" +Interface for parsing with BLLIP Parser. Requires the Python +bllipparser module. BllipParser objects can be constructed with the +``BllipParser.from_unified_model_dir`` class method or manually using the +``BllipParser`` constructor. The former is generally easier if you have +a BLLIP Parser unified model directory -- a basic model can be obtained +from NLTK's downloader. More unified parsing models can be obtained with +BLLIP Parser's ModelFetcher (run ``python -m bllipparser.ModelFetcher`` +or see docs for ``bllipparser.ModelFetcher.download_and_install_model``). + +Basic usage:: + + # download and install a basic unified parsing model (Wall Street Journal) + # sudo python -m nltk.downloader bllip_wsj_no_aux + + >>> from nltk.data import find + >>> model_dir = find('models/bllip_wsj_no_aux').path + >>> bllip = BllipParser.from_unified_model_dir(model_dir) + + # 1-best parsing + >>> sentence1 = 'British left waffles on Falklands .'.split() + >>> top_parse = bllip.parse_one(sentence1) + >>> print(top_parse) + (S1 + (S + (NP (JJ British) (NN left)) + (VP (VBZ waffles) (PP (IN on) (NP (NNP Falklands)))) + (. 
.))) + + # n-best parsing + >>> sentence2 = 'Time flies'.split() + >>> all_parses = bllip.parse_all(sentence2) + >>> print(len(all_parses)) + 50 + >>> print(all_parses[0]) + (S1 (S (NP (NNP Time)) (VP (VBZ flies)))) + + # incorporating external tagging constraints (None means unconstrained tag) + >>> constrained1 = bllip.tagged_parse([('Time', 'VB'), ('flies', 'NNS')]) + >>> print(next(constrained1)) + (S1 (NP (VB Time) (NNS flies))) + >>> constrained2 = bllip.tagged_parse([('Time', 'NN'), ('flies', None)]) + >>> print(next(constrained2)) + (S1 (NP (NN Time) (VBZ flies))) + +References +---------- + +- Charniak, Eugene. "A maximum-entropy-inspired parser." Proceedings of + the 1st North American chapter of the Association for Computational + Linguistics conference. Association for Computational Linguistics, + 2000. + +- Charniak, Eugene, and Mark Johnson. "Coarse-to-fine n-best parsing + and MaxEnt discriminative reranking." Proceedings of the 43rd Annual + Meeting on Association for Computational Linguistics. Association + for Computational Linguistics, 2005. + +Known issues +------------ + +Note that BLLIP Parser is not currently threadsafe. Since this module +uses a SWIG interface, it is potentially unsafe to create multiple +``BllipParser`` objects in the same process. BLLIP Parser currently +has issues with non-ASCII text and will raise an error if given any. + +See https://pypi.python.org/pypi/bllipparser/ for more information +on BLLIP Parser's Python interface. +""" + +__all__ = ["BllipParser"] + +# this block allows this module to be imported even if bllipparser isn't +# available +try: + from bllipparser import RerankingParser + from bllipparser.RerankingParser import get_unified_model_parameters + + def _ensure_bllip_import_or_error(): + pass + +except ImportError as ie: + + def _ensure_bllip_import_or_error(ie=ie): + raise ImportError("Couldn't import bllipparser module: %s" % ie) + + +def _ensure_ascii(words): + try: + for i, word in enumerate(words): + word.encode("ascii") + except UnicodeEncodeError as e: + raise ValueError( + f"Token {i} ({word!r}) is non-ASCII. BLLIP Parser " + "currently doesn't support non-ASCII inputs." + ) from e + + +def _scored_parse_to_nltk_tree(scored_parse): + return Tree.fromstring(str(scored_parse.ptb_parse)) + + +class BllipParser(ParserI): + """ + Interface for parsing with BLLIP Parser. BllipParser objects can be + constructed with the ``BllipParser.from_unified_model_dir`` class + method or manually using the ``BllipParser`` constructor. + """ + + def __init__( + self, + parser_model=None, + reranker_features=None, + reranker_weights=None, + parser_options=None, + reranker_options=None, + ): + """ + Load a BLLIP Parser model from scratch. You'll typically want to + use the ``from_unified_model_dir()`` class method to construct + this object. + + :param parser_model: Path to parser model directory + :type parser_model: str + + :param reranker_features: Path the reranker model's features file + :type reranker_features: str + + :param reranker_weights: Path the reranker model's weights file + :type reranker_weights: str + + :param parser_options: optional dictionary of parser options, see + ``bllipparser.RerankingParser.RerankingParser.load_parser_options()`` + for more information. + :type parser_options: dict(str) + + :param reranker_options: optional + dictionary of reranker options, see + ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()`` + for more information. 
+ :type reranker_options: dict(str) + """ + _ensure_bllip_import_or_error() + + parser_options = parser_options or {} + reranker_options = reranker_options or {} + + self.rrp = RerankingParser() + self.rrp.load_parser_model(parser_model, **parser_options) + if reranker_features and reranker_weights: + self.rrp.load_reranker_model( + features_filename=reranker_features, + weights_filename=reranker_weights, + **reranker_options, + ) + + def parse(self, sentence): + """ + Use BLLIP Parser to parse a sentence. Takes a sentence as a list + of words; it will be automatically tagged with this BLLIP Parser + instance's tagger. + + :return: An iterator that generates parse trees for the sentence + from most likely to least likely. + + :param sentence: The sentence to be parsed + :type sentence: list(str) + :rtype: iter(Tree) + """ + _ensure_ascii(sentence) + nbest_list = self.rrp.parse(sentence) + for scored_parse in nbest_list: + yield _scored_parse_to_nltk_tree(scored_parse) + + def tagged_parse(self, word_and_tag_pairs): + """ + Use BLLIP to parse a sentence. Takes a sentence as a list of + (word, tag) tuples; the sentence must have already been tokenized + and tagged. BLLIP will attempt to use the tags provided but may + use others if it can't come up with a complete parse subject + to those constraints. You may also specify a tag as ``None`` + to leave a token's tag unconstrained. + + :return: An iterator that generates parse trees for the sentence + from most likely to least likely. + + :param sentence: Input sentence to parse as (word, tag) pairs + :type sentence: list(tuple(str, str)) + :rtype: iter(Tree) + """ + words = [] + tag_map = {} + for i, (word, tag) in enumerate(word_and_tag_pairs): + words.append(word) + if tag is not None: + tag_map[i] = tag + + _ensure_ascii(words) + nbest_list = self.rrp.parse_tagged(words, tag_map) + for scored_parse in nbest_list: + yield _scored_parse_to_nltk_tree(scored_parse) + + @classmethod + def from_unified_model_dir( + cls, model_dir, parser_options=None, reranker_options=None + ): + """ + Create a ``BllipParser`` object from a unified parsing model + directory. Unified parsing model directories are a standardized + way of storing BLLIP parser and reranker models together on disk. + See ``bllipparser.RerankingParser.get_unified_model_parameters()`` + for more information about unified model directories. + + :return: A ``BllipParser`` object using the parser and reranker + models in the model directory. + + :param model_dir: Path to the unified model directory. + :type model_dir: str + :param parser_options: optional dictionary of parser options, see + ``bllipparser.RerankingParser.RerankingParser.load_parser_options()`` + for more information. + :type parser_options: dict(str) + :param reranker_options: optional dictionary of reranker options, see + ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()`` + for more information. 
+ :type reranker_options: dict(str) + :rtype: BllipParser + """ + ( + parser_model_dir, + reranker_features_filename, + reranker_weights_filename, + ) = get_unified_model_parameters(model_dir) + return cls( + parser_model_dir, + reranker_features_filename, + reranker_weights_filename, + parser_options, + reranker_options, + ) + + +def demo(): + """This assumes the Python module bllipparser is installed.""" + + # download and install a basic unified parsing model (Wall Street Journal) + # sudo python -m nltk.downloader bllip_wsj_no_aux + + from nltk.data import find + + model_dir = find("models/bllip_wsj_no_aux").path + + print("Loading BLLIP Parsing models...") + # the easiest way to get started is to use a unified model + bllip = BllipParser.from_unified_model_dir(model_dir) + print("Done.") + + sentence1 = "British left waffles on Falklands .".split() + sentence2 = "I saw the man with the telescope .".split() + # this sentence is known to fail under the WSJ parsing model + fail1 = "# ! ? : -".split() + for sentence in (sentence1, sentence2, fail1): + print("Sentence: %r" % " ".join(sentence)) + try: + tree = next(bllip.parse(sentence)) + print(tree) + except StopIteration: + print("(parse failed)") + + # n-best parsing demo + for i, parse in enumerate(bllip.parse(sentence1)): + print("parse %d:\n%s" % (i, parse)) + + # using external POS tag constraints + print( + "forcing 'tree' to be 'NN':", + next(bllip.tagged_parse([("A", None), ("tree", "NN")])), + ) + print( + "forcing 'A' to be 'DT' and 'tree' to be 'NNP':", + next(bllip.tagged_parse([("A", "DT"), ("tree", "NNP")])), + ) + # constraints don't have to make sense... (though on more complicated + # sentences, they may cause the parse to fail) + print( + "forcing 'A' to be 'NNP':", + next(bllip.tagged_parse([("A", "NNP"), ("tree", None)])), + ) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/parse/corenlp.py b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/corenlp.py new file mode 100644 index 0000000000000000000000000000000000000000..b7f36be3c82a191bc259f3c8578bf822e1676a1e --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/parse/corenlp.py @@ -0,0 +1,800 @@ +# Natural Language Toolkit: Interface to the CoreNLP REST API. 
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Dmitrijs Milajevs
+#
+# URL:
+# For license information, see LICENSE.TXT
+
+import json
+import os  # required for doctests
+import re
+import socket
+import time
+from typing import List, Tuple
+
+from nltk.internals import _java_options, config_java, find_jar_iter, java
+from nltk.parse.api import ParserI
+from nltk.parse.dependencygraph import DependencyGraph
+from nltk.tag.api import TaggerI
+from nltk.tokenize.api import TokenizerI
+from nltk.tree import Tree
+
+_stanford_url = "https://stanfordnlp.github.io/CoreNLP/"
+
+
+class CoreNLPServerError(EnvironmentError):
+    """Exceptions associated with the Core NLP server."""
+
+
+def try_port(port=0):
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    sock.bind(("", port))
+
+    p = sock.getsockname()[1]
+    sock.close()
+
+    return p
+
+
+class CoreNLPServer:
+
+    _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar"
+    _JAR = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar"
+
+    def __init__(
+        self,
+        path_to_jar=None,
+        path_to_models_jar=None,
+        verbose=False,
+        java_options=None,
+        corenlp_options=None,
+        port=None,
+    ):
+
+        if corenlp_options is None:
+            corenlp_options = ["-preload", "tokenize,ssplit,pos,lemma,parse,depparse"]
+
+        jars = list(
+            find_jar_iter(
+                self._JAR,
+                path_to_jar,
+                env_vars=("CORENLP",),
+                searchpath=(),
+                url=_stanford_url,
+                verbose=verbose,
+                is_regex=True,
+            )
+        )
+
+        # find the most recent code and model jar
+        stanford_jar = max(jars, key=lambda model_name: re.match(self._JAR, model_name))
+
+        if port is None:
+            try:
+                port = try_port(9000)
+            except OSError:
+                port = try_port()
+            corenlp_options.extend(["-port", str(port)])
+        else:
+            try_port(port)
+            corenlp_options.extend(["-port", str(port)])
+
+        self.url = f"http://localhost:{port}"
+
+        model_jar = max(
+            find_jar_iter(
+                self._MODEL_JAR_PATTERN,
+                path_to_models_jar,
+                env_vars=("CORENLP_MODELS",),
+                searchpath=(),
+                url=_stanford_url,
+                verbose=verbose,
+                is_regex=True,
+            ),
+            key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name),
+        )
+
+        self.verbose = verbose
+
+        self._classpath = stanford_jar, model_jar
+
+        self.corenlp_options = corenlp_options
+        self.java_options = java_options or ["-mx2g"]
+
+    def start(self, stdout="devnull", stderr="devnull"):
+        """Starts the CoreNLP server.
+
+        :param stdout, stderr: Specifies where CoreNLP output is redirected.
+            Valid values are 'devnull', 'stdout', 'pipe'.
+        """
+        import requests
+
+        cmd = ["edu.stanford.nlp.pipeline.StanfordCoreNLPServer"]
+
+        if self.corenlp_options:
+            cmd.extend(self.corenlp_options)
+
+        # Configure java.
+        default_options = " ".join(_java_options)
+        config_java(options=self.java_options, verbose=self.verbose)
+
+        try:
+            self.popen = java(
+                cmd,
+                classpath=self._classpath,
+                blocking=False,
+                stdout=stdout,
+                stderr=stderr,
+            )
+        finally:
+            # Return java configurations to their default values.
+            config_java(options=default_options, verbose=self.verbose)
+
+        # Check that the server is still running.
+        returncode = self.popen.poll()
+        if returncode is not None:
+            _, stderrdata = self.popen.communicate()
+            raise CoreNLPServerError(
+                returncode,
+                "Could not start the server. 
" + "The error was: {}".format(stderrdata.decode("ascii")), + ) + + for i in range(30): + try: + response = requests.get(requests.compat.urljoin(self.url, "live")) + except requests.exceptions.ConnectionError: + time.sleep(1) + else: + if response.ok: + break + else: + raise CoreNLPServerError("Could not connect to the server.") + + for i in range(60): + try: + response = requests.get(requests.compat.urljoin(self.url, "ready")) + except requests.exceptions.ConnectionError: + time.sleep(1) + else: + if response.ok: + break + else: + raise CoreNLPServerError("The server is not ready.") + + def stop(self): + self.popen.terminate() + self.popen.wait() + + def __enter__(self): + self.start() + + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.stop() + return False + + +class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI): + """Interface to the CoreNLP Parser.""" + + def __init__( + self, + url="http://localhost:9000", + encoding="utf8", + tagtype=None, + strict_json=True, + ): + import requests + + self.url = url + self.encoding = encoding + + if tagtype not in ["pos", "ner", None]: + raise ValueError("tagtype must be either 'pos', 'ner' or None") + + self.tagtype = tagtype + self.strict_json = strict_json + + self.session = requests.Session() + + def parse_sents(self, sentences, *args, **kwargs): + """Parse multiple sentences. + + Takes multiple sentences as a list where each sentence is a list of + words. Each sentence will be automatically tagged with this + CoreNLPParser instance's tagger. + + If a whitespace exists inside a token, then the token will be treated as + several tokens. + + :param sentences: Input sentences to parse + :type sentences: list(list(str)) + :rtype: iter(iter(Tree)) + """ + # Converting list(list(str)) -> list(str) + sentences = (" ".join(words) for words in sentences) + return self.raw_parse_sents(sentences, *args, **kwargs) + + def raw_parse(self, sentence, properties=None, *args, **kwargs): + """Parse a sentence. + + Takes a sentence as a string; before parsing, it will be automatically + tokenized and tagged by the CoreNLP Parser. + + :param sentence: Input sentence to parse + :type sentence: str + :rtype: iter(Tree) + """ + default_properties = {"tokenize.whitespace": "false"} + default_properties.update(properties or {}) + + return next( + self.raw_parse_sents( + [sentence], properties=default_properties, *args, **kwargs + ) + ) + + def api_call(self, data, properties=None, timeout=60): + default_properties = { + "outputFormat": "json", + "annotators": "tokenize,pos,lemma,ssplit,{parser_annotator}".format( + parser_annotator=self.parser_annotator + ), + } + + default_properties.update(properties or {}) + + response = self.session.post( + self.url, + params={"properties": json.dumps(default_properties)}, + data=data.encode(self.encoding), + headers={"Content-Type": f"text/plain; charset={self.encoding}"}, + timeout=timeout, + ) + + response.raise_for_status() + + return response.json(strict=self.strict_json) + + def raw_parse_sents( + self, sentences, verbose=False, properties=None, *args, **kwargs + ): + """Parse multiple sentences. + + Takes multiple sentences as a list of strings. Each sentence will be + automatically tokenized and tagged. + + :param sentences: Input sentences to parse. + :type sentences: list(str) + :rtype: iter(iter(Tree)) + + """ + default_properties = { + # Only splits on '\n', never inside the sentence. 
+ "ssplit.eolonly": "true" + } + + default_properties.update(properties or {}) + + """ + for sentence in sentences: + parsed_data = self.api_call(sentence, properties=default_properties) + + assert len(parsed_data['sentences']) == 1 + + for parse in parsed_data['sentences']: + tree = self.make_tree(parse) + yield iter([tree]) + """ + parsed_data = self.api_call("\n".join(sentences), properties=default_properties) + for parsed_sent in parsed_data["sentences"]: + tree = self.make_tree(parsed_sent) + yield iter([tree]) + + def parse_text(self, text, *args, **kwargs): + """Parse a piece of text. + + The text might contain several sentences which will be split by CoreNLP. + + :param str text: text to be split. + :returns: an iterable of syntactic structures. # TODO: should it be an iterable of iterables? + + """ + parsed_data = self.api_call(text, *args, **kwargs) + + for parse in parsed_data["sentences"]: + yield self.make_tree(parse) + + def tokenize(self, text, properties=None): + """Tokenize a string of text. + + Skip these tests if CoreNLP is likely not ready. + >>> from nltk.test.setup_fixt import check_jar + >>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True) + + The CoreNLP server can be started using the following notation, although + we recommend the `with CoreNLPServer() as server:` context manager notation + to ensure that the server is always stopped. + >>> server = CoreNLPServer() + >>> server.start() + >>> parser = CoreNLPParser(url=server.url) + + >>> text = 'Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.' + >>> list(parser.tokenize(text)) + ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] + + >>> s = "The colour of the wall is blue." + >>> list( + ... parser.tokenize( + ... 'The colour of the wall is blue.', + ... properties={'tokenize.options': 'americanize=true'}, + ... ) + ... ) + ['The', 'colour', 'of', 'the', 'wall', 'is', 'blue', '.'] + >>> server.stop() + + """ + default_properties = {"annotators": "tokenize,ssplit"} + + default_properties.update(properties or {}) + + result = self.api_call(text, properties=default_properties) + + for sentence in result["sentences"]: + for token in sentence["tokens"]: + yield token["originalText"] or token["word"] + + def tag_sents(self, sentences): + """ + Tag multiple sentences. + + Takes multiple sentences as a list where each sentence is a list of + tokens. + + :param sentences: Input sentences to tag + :type sentences: list(list(str)) + :rtype: list(list(tuple(str, str)) + """ + # Converting list(list(str)) -> list(str) + sentences = (" ".join(words) for words in sentences) + return [sentences[0] for sentences in self.raw_tag_sents(sentences)] + + def tag(self, sentence: str) -> List[Tuple[str, str]]: + """ + Tag a list of tokens. + + :rtype: list(tuple(str, str)) + + Skip these tests if CoreNLP is likely not ready. + >>> from nltk.test.setup_fixt import check_jar + >>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True) + + The CoreNLP server can be started using the following notation, although + we recommend the `with CoreNLPServer() as server:` context manager notation + to ensure that the server is always stopped. 
+ >>> server = CoreNLPServer() + >>> server.start() + >>> parser = CoreNLPParser(url=server.url, tagtype='ner') + >>> tokens = 'Rami Eid is studying at Stony Brook University in NY'.split() + >>> parser.tag(tokens) # doctest: +NORMALIZE_WHITESPACE + [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), + ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'STATE_OR_PROVINCE')] + + >>> parser = CoreNLPParser(url=server.url, tagtype='pos') + >>> tokens = "What is the airspeed of an unladen swallow ?".split() + >>> parser.tag(tokens) # doctest: +NORMALIZE_WHITESPACE + [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), + ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), + ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')] + >>> server.stop() + """ + return self.tag_sents([sentence])[0] + + def raw_tag_sents(self, sentences): + """ + Tag multiple sentences. + + Takes multiple sentences as a list where each sentence is a string. + + :param sentences: Input sentences to tag + :type sentences: list(str) + :rtype: list(list(list(tuple(str, str))) + """ + default_properties = { + "ssplit.isOneSentence": "true", + "annotators": "tokenize,ssplit,", + } + + # Supports only 'pos' or 'ner' tags. + assert self.tagtype in ["pos", "ner"] + default_properties["annotators"] += self.tagtype + for sentence in sentences: + tagged_data = self.api_call(sentence, properties=default_properties) + yield [ + [ + (token["word"], token[self.tagtype]) + for token in tagged_sentence["tokens"] + ] + for tagged_sentence in tagged_data["sentences"] + ] + + +class CoreNLPParser(GenericCoreNLPParser): + """ + Skip these tests if CoreNLP is likely not ready. + >>> from nltk.test.setup_fixt import check_jar + >>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True) + + The recommended usage of `CoreNLPParser` is using the context manager notation: + >>> with CoreNLPServer() as server: + ... parser = CoreNLPParser(url=server.url) + ... next( + ... parser.raw_parse('The quick brown fox jumps over the lazy dog.') + ... ).pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + _______________|__________________________ + | VP | + | _________|___ | + | | PP | + | | ________|___ | + NP | | NP | + ____|__________ | | _______|____ | + DT JJ JJ NN VBZ IN DT JJ NN . + | | | | | | | | | | + The quick brown fox jumps over the lazy dog . + + Alternatively, the server can be started using the following notation. + Note that `CoreNLPServer` does not need to be used if the CoreNLP server is started + outside of Python. + >>> server = CoreNLPServer() + >>> server.start() + >>> parser = CoreNLPParser(url=server.url) + + >>> (parse_fox, ), (parse_wolf, ) = parser.raw_parse_sents( + ... [ + ... 'The quick brown fox jumps over the lazy dog.', + ... 'The quick grey wolf jumps over the lazy fox.', + ... ] + ... ) + + >>> parse_fox.pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + _______________|__________________________ + | VP | + | _________|___ | + | | PP | + | | ________|___ | + NP | | NP | + ____|__________ | | _______|____ | + DT JJ JJ NN VBZ IN DT JJ NN . + | | | | | | | | | | + The quick brown fox jumps over the lazy dog . + + >>> parse_wolf.pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + _______________|__________________________ + | VP | + | _________|___ | + | | PP | + | | ________|___ | + NP | | NP | + ____|_________ | | _______|____ | + DT JJ JJ NN VBZ IN DT JJ NN . 
+ | | | | | | | | | | + The quick grey wolf jumps over the lazy fox . + + >>> (parse_dog, ), (parse_friends, ) = parser.parse_sents( + ... [ + ... "I 'm a dog".split(), + ... "This is my friends ' cat ( the tabby )".split(), + ... ] + ... ) + + >>> parse_dog.pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + _______|____ + | VP + | ________|___ + NP | NP + | | ___|___ + PRP VBP DT NN + | | | | + I 'm a dog + + >>> parse_friends.pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + ____|___________ + | VP + | ___________|_____________ + | | NP + | | _______|________________________ + | | NP | | | + | | _____|_______ | | | + NP | NP | | NP | + | | ______|_________ | | ___|____ | + DT VBZ PRP$ NNS POS NN -LRB- DT NN -RRB- + | | | | | | | | | | + This is my friends ' cat -LRB- the tabby -RRB- + + >>> parse_john, parse_mary, = parser.parse_text( + ... 'John loves Mary. Mary walks.' + ... ) + + >>> parse_john.pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + _____|_____________ + | VP | + | ____|___ | + NP | NP | + | | | | + NNP VBZ NNP . + | | | | + John loves Mary . + + >>> parse_mary.pretty_print() # doctest: +NORMALIZE_WHITESPACE + ROOT + | + S + _____|____ + NP VP | + | | | + NNP VBZ . + | | | + Mary walks . + + Special cases + + >>> next( + ... parser.raw_parse( + ... 'NASIRIYA, Iraq—Iraqi doctors who treated former prisoner of war ' + ... 'Jessica Lynch have angrily dismissed claims made in her biography ' + ... 'that she was raped by her Iraqi captors.' + ... ) + ... ).height() + 14 + + >>> next( + ... parser.raw_parse( + ... "The broader Standard & Poor's 500 Index <.SPX> was 0.46 points lower, or " + ... '0.05 percent, at 997.02.' + ... ) + ... ).height() + 11 + + >>> server.stop() + """ + + _OUTPUT_FORMAT = "penn" + parser_annotator = "parse" + + def make_tree(self, result): + return Tree.fromstring(result["parse"]) + + +class CoreNLPDependencyParser(GenericCoreNLPParser): + """Dependency parser. + + Skip these tests if CoreNLP is likely not ready. + >>> from nltk.test.setup_fixt import check_jar + >>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True) + + The recommended usage of `CoreNLPParser` is using the context manager notation: + >>> with CoreNLPServer() as server: + ... dep_parser = CoreNLPDependencyParser(url=server.url) + ... parse, = dep_parser.raw_parse( + ... 'The quick brown fox jumps over the lazy dog.' + ... ) + ... print(parse.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + The DT 4 det + quick JJ 4 amod + brown JJ 4 amod + fox NN 5 nsubj + jumps VBZ 0 ROOT + over IN 9 case + the DT 9 det + lazy JJ 9 amod + dog NN 5 obl + . . 5 punct + + Alternatively, the server can be started using the following notation. + Note that `CoreNLPServer` does not need to be used if the CoreNLP server is started + outside of Python. + >>> server = CoreNLPServer() + >>> server.start() + >>> dep_parser = CoreNLPDependencyParser(url=server.url) + >>> parse, = dep_parser.raw_parse('The quick brown fox jumps over the lazy dog.') + >>> print(parse.tree()) # doctest: +NORMALIZE_WHITESPACE + (jumps (fox The quick brown) (dog over the lazy) .) + + >>> for governor, dep, dependent in parse.triples(): + ... 
print(governor, dep, dependent) # doctest: +NORMALIZE_WHITESPACE + ('jumps', 'VBZ') nsubj ('fox', 'NN') + ('fox', 'NN') det ('The', 'DT') + ('fox', 'NN') amod ('quick', 'JJ') + ('fox', 'NN') amod ('brown', 'JJ') + ('jumps', 'VBZ') obl ('dog', 'NN') + ('dog', 'NN') case ('over', 'IN') + ('dog', 'NN') det ('the', 'DT') + ('dog', 'NN') amod ('lazy', 'JJ') + ('jumps', 'VBZ') punct ('.', '.') + + >>> (parse_fox, ), (parse_dog, ) = dep_parser.raw_parse_sents( + ... [ + ... 'The quick brown fox jumps over the lazy dog.', + ... 'The quick grey wolf jumps over the lazy fox.', + ... ] + ... ) + >>> print(parse_fox.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + The DT 4 det + quick JJ 4 amod + brown JJ 4 amod + fox NN 5 nsubj + jumps VBZ 0 ROOT + over IN 9 case + the DT 9 det + lazy JJ 9 amod + dog NN 5 obl + . . 5 punct + + >>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + The DT 4 det + quick JJ 4 amod + grey JJ 4 amod + wolf NN 5 nsubj + jumps VBZ 0 ROOT + over IN 9 case + the DT 9 det + lazy JJ 9 amod + fox NN 5 obl + . . 5 punct + + >>> (parse_dog, ), (parse_friends, ) = dep_parser.parse_sents( + ... [ + ... "I 'm a dog".split(), + ... "This is my friends ' cat ( the tabby )".split(), + ... ] + ... ) + >>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + I PRP 4 nsubj + 'm VBP 4 cop + a DT 4 det + dog NN 0 ROOT + + >>> print(parse_friends.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + This DT 6 nsubj + is VBZ 6 cop + my PRP$ 4 nmod:poss + friends NNS 6 nmod:poss + ' POS 4 case + cat NN 0 ROOT + ( -LRB- 9 punct + the DT 9 det + tabby NN 6 dep + ) -RRB- 9 punct + + >>> parse_john, parse_mary, = dep_parser.parse_text( + ... 'John loves Mary. Mary walks.' + ... ) + + >>> print(parse_john.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + John NNP 2 nsubj + loves VBZ 0 ROOT + Mary NNP 2 obj + . . 2 punct + + >>> print(parse_mary.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE + Mary NNP 2 nsubj + walks VBZ 0 ROOT + . . 2 punct + + Special cases + + Non-breaking space inside of a token. + + >>> len( + ... next( + ... dep_parser.raw_parse( + ... 'Anhalt said children typically treat a 20-ounce soda bottle as one ' + ... 'serving, while it actually contains 2 1/2 servings.' + ... ) + ... ).nodes + ... ) + 23 + + Phone numbers. + + >>> len( + ... next( + ... dep_parser.raw_parse('This is not going to crash: 01 111 555.') + ... ).nodes + ... ) + 10 + + >>> print( + ... next( + ... dep_parser.raw_parse('The underscore _ should not simply disappear.') + ... ).to_conll(4) + ... ) # doctest: +NORMALIZE_WHITESPACE + The DT 2 det + underscore NN 7 nsubj + _ NFP 7 punct + should MD 7 aux + not RB 7 advmod + simply RB 7 advmod + disappear VB 0 ROOT + . . 7 punct + + >>> print( + ... next( + ... dep_parser.raw_parse( + ... 'for all of its insights into the dream world of teen life , and its electronic expression through ' + ... 'cyber culture , the film gives no quarter to anyone seeking to pull a cohesive story out of its 2 ' + ... '1/2-hour running time .' + ... ) + ... ).to_conll(4) + ... ) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + for IN 2 case + all DT 24 obl + of IN 5 case + its PRP$ 5 nmod:poss + insights NNS 2 nmod + into IN 9 case + the DT 9 det + dream NN 9 compound + world NN 5 nmod + of IN 12 case + teen NN 12 compound + ... + + >>> server.stop() + """ + + _OUTPUT_FORMAT = "conll2007" + parser_annotator = "depparse" + + def make_tree(self, result): + + return DependencyGraph( + ( + " ".join(n_items[1:]) # NLTK expects an iterable of strings... 
+ for n_items in sorted(transform(result)) + ), + cell_separator=" ", # To make sure that a non-breaking space is kept inside of a token. + ) + + +def transform(sentence): + for dependency in sentence["basicDependencies"]: + + dependent_index = dependency["dependent"] + token = sentence["tokens"][dependent_index - 1] + + # Return values that we don't know as '_'. Also, consider tag and ctag + # to be equal. + yield ( + dependent_index, + "_", + token["word"], + token["lemma"], + token["pos"], + token["pos"], + "_", + str(dependency["governor"]), + dependency["dep"], + "_", + "_", + ) diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/sem/lfg.py b/.eggs/nltk-3.8-py3.10.egg/nltk/sem/lfg.py new file mode 100644 index 0000000000000000000000000000000000000000..40b50517d97138290ca42b24ef9fc3104f231de9 --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/sem/lfg.py @@ -0,0 +1,261 @@ +# Natural Language Toolkit: Lexical Functional Grammar +# +# Author: Dan Garrette +# +# Copyright (C) 2001-2022 NLTK Project +# URL: +# For license information, see LICENSE.TXT + +from itertools import chain + +from nltk.internals import Counter + + +class FStructure(dict): + def safeappend(self, key, item): + """ + Append 'item' to the list at 'key'. If no list exists for 'key', then + construct one. + """ + if key not in self: + self[key] = [] + self[key].append(item) + + def __setitem__(self, key, value): + dict.__setitem__(self, key.lower(), value) + + def __getitem__(self, key): + return dict.__getitem__(self, key.lower()) + + def __contains__(self, key): + return dict.__contains__(self, key.lower()) + + def to_glueformula_list(self, glue_dict): + depgraph = self.to_depgraph() + return glue_dict.to_glueformula_list(depgraph) + + def to_depgraph(self, rel=None): + from nltk.parse.dependencygraph import DependencyGraph + + depgraph = DependencyGraph() + nodes = depgraph.nodes + + self._to_depgraph(nodes, 0, "ROOT") + + # Add all the dependencies for all the nodes + for address, node in nodes.items(): + for n2 in (n for n in nodes.values() if n["rel"] != "TOP"): + if n2["head"] == address: + relation = n2["rel"] + node["deps"].setdefault(relation, []) + node["deps"][relation].append(n2["address"]) + + depgraph.root = nodes[1] + + return depgraph + + def _to_depgraph(self, nodes, head, rel): + index = len(nodes) + + nodes[index].update( + { + "address": index, + "word": self.pred[0], + "tag": self.pred[1], + "head": head, + "rel": rel, + } + ) + + for feature in sorted(self): + for item in sorted(self[feature]): + if isinstance(item, FStructure): + item._to_depgraph(nodes, index, feature) + elif isinstance(item, tuple): + new_index = len(nodes) + nodes[new_index].update( + { + "address": new_index, + "word": item[0], + "tag": item[1], + "head": index, + "rel": feature, + } + ) + elif isinstance(item, list): + for n in item: + n._to_depgraph(nodes, index, feature) + else: + raise Exception( + "feature %s is not an FStruct, a list, or a tuple" % feature + ) + + @staticmethod + def read_depgraph(depgraph): + return FStructure._read_depgraph(depgraph.root, depgraph) + + @staticmethod + def _read_depgraph(node, depgraph, label_counter=None, parent=None): + if not label_counter: + label_counter = Counter() + + if node["rel"].lower() in ["spec", "punct"]: + # the value of a 'spec' entry is a word, not an FStructure + return (node["word"], node["tag"]) + + else: + fstruct = FStructure() + fstruct.pred = None + fstruct.label = FStructure._make_label(label_counter.get()) + + fstruct.parent = parent + + word, tag = node["word"], 
node["tag"] + if tag[:2] == "VB": + if tag[2:3] == "D": + fstruct.safeappend("tense", ("PAST", "tense")) + fstruct.pred = (word, tag[:2]) + + if not fstruct.pred: + fstruct.pred = (word, tag) + + children = [ + depgraph.nodes[idx] + for idx in chain.from_iterable(node["deps"].values()) + ] + for child in children: + fstruct.safeappend( + child["rel"], + FStructure._read_depgraph(child, depgraph, label_counter, fstruct), + ) + + return fstruct + + @staticmethod + def _make_label(value): + """ + Pick an alphabetic character as identifier for an entity in the model. + + :param value: where to index into the list of characters + :type value: int + """ + letter = [ + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + "a", + "b", + "c", + "d", + "e", + ][value - 1] + num = int(value) // 26 + if num > 0: + return letter + str(num) + else: + return letter + + def __repr__(self): + return self.__str__().replace("\n", "") + + def __str__(self): + return self.pretty_format() + + def pretty_format(self, indent=3): + try: + accum = "%s:[" % self.label + except NameError: + accum = "[" + try: + accum += "pred '%s'" % (self.pred[0]) + except NameError: + pass + + for feature in sorted(self): + for item in self[feature]: + if isinstance(item, FStructure): + next_indent = indent + len(feature) + 3 + len(self.label) + accum += "\n{}{} {}".format( + " " * (indent), + feature, + item.pretty_format(next_indent), + ) + elif isinstance(item, tuple): + accum += "\n{}{} '{}'".format(" " * (indent), feature, item[0]) + elif isinstance(item, list): + accum += "\n{}{} {{{}}}".format( + " " * (indent), + feature, + ("\n%s" % (" " * (indent + len(feature) + 2))).join(item), + ) + else: # ERROR + raise Exception( + "feature %s is not an FStruct, a list, or a tuple" % feature + ) + return accum + "]" + + +def demo_read_depgraph(): + from nltk.parse.dependencygraph import DependencyGraph + + dg1 = DependencyGraph( + """\ +Esso NNP 2 SUB +said VBD 0 ROOT +the DT 5 NMOD +Whiting NNP 5 NMOD +field NN 6 SUB +started VBD 2 VMOD +production NN 6 OBJ +Tuesday NNP 6 VMOD +""" + ) + dg2 = DependencyGraph( + """\ +John NNP 2 SUB +sees VBP 0 ROOT +Mary NNP 2 OBJ +""" + ) + dg3 = DependencyGraph( + """\ +a DT 2 SPEC +man NN 3 SUBJ +walks VB 0 ROOT +""" + ) + dg4 = DependencyGraph( + """\ +every DT 2 SPEC +girl NN 3 SUBJ +chases VB 0 ROOT +a DT 5 SPEC +dog NN 3 OBJ +""" + ) + + depgraphs = [dg1, dg2, dg3, dg4] + for dg in depgraphs: + print(FStructure.read_depgraph(dg)) + + +if __name__ == "__main__": + demo_read_depgraph() diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/sentiment/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/sentiment/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c3eaee5ef7a26a80cd721a7daf01c59abe05bcff --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/sentiment/__init__.py @@ -0,0 +1,13 @@ +# Natural Language Toolkit: Sentiment Analysis +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Ewan Klein +# URL: +# For license information, see LICENSE.TXT + +""" +NLTK Sentiment Analysis Package + +""" +from nltk.sentiment.sentiment_analyzer import SentimentAnalyzer +from nltk.sentiment.vader import SentimentIntensityAnalyzer diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/sentiment/util.py b/.eggs/nltk-3.8-py3.10.egg/nltk/sentiment/util.py new file mode 100644 index 0000000000000000000000000000000000000000..eb8b4ad83765a44a550f15ebaa88ecb8c213b423 --- /dev/null +++ 
b/.eggs/nltk-3.8-py3.10.egg/nltk/sentiment/util.py
@@ -0,0 +1,887 @@
+#
+# Natural Language Toolkit: Sentiment Analyzer
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
+# URL:
+# For license information, see LICENSE.TXT
+
+"""
+Utility methods for Sentiment Analysis.
+"""
+
+import codecs
+import csv
+import json
+import pickle
+import random
+import re
+import sys
+import time
+from copy import deepcopy
+
+import nltk
+from nltk.corpus import CategorizedPlaintextCorpusReader
+from nltk.data import load
+from nltk.tokenize.casual import EMOTICON_RE
+
+# ////////////////////////////////////////////////////////////
+# { Regular expressions
+# ////////////////////////////////////////////////////////////
+
+# Regular expression for negation by Christopher Potts
+NEGATION = r"""
+    (?:
+        ^(?:never|no|nothing|nowhere|noone|none|not|
+            havent|hasnt|hadnt|cant|couldnt|shouldnt|
+            wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
+        )$
+    )
+    |
+    n't"""
+
+NEGATION_RE = re.compile(NEGATION, re.VERBOSE)
+
+CLAUSE_PUNCT = r"^[.:;!?]$"
+CLAUSE_PUNCT_RE = re.compile(CLAUSE_PUNCT)
+
+# Happy and sad emoticons
+
+HAPPY = {
+    ":-)",
+    ":)",
+    ";)",
+    ":o)",
+    ":]",
+    ":3",
+    ":c)",
+    ":>",
+    "=]",
+    "8)",
+    "=)",
+    ":}",
+    ":^)",
+    ":-D",
+    ":D",
+    "8-D",
+    "8D",
+    "x-D",
+    "xD",
+    "X-D",
+    "XD",
+    "=-D",
+    "=D",
+    "=-3",
+    "=3",
+    ":-))",
+    ":'-)",
+    ":')",
+    ":*",
+    ":^*",
+    ">:P",
+    ":-P",
+    ":P",
+    "X-P",
+    "x-p",
+    "xp",
+    "XP",
+    ":-p",
+    ":p",
+    "=p",
+    ":-b",
+    ":b",
+    ">:)",
+    ">;)",
+    ">:-)",
+    "<3",
+}
+
+SAD = {
+    ":L",
+    ":-/",
+    ">:/",
+    ":S",
+    ">:[",
+    ":@",
+    ":-(",
+    ":[",
+    ":-||",
+    "=L",
+    ":<",
+    ":-[",
+    ":-<",
+    "=\\",
+    "=/",
+    ">:(",
+    ":(",
+    ">.<",
+    ":'-(",
+    ":'(",
+    ":\\",
+    ":-c",
+    ":c",
+    ":{",
+    ">:\\",
+    ";(",
+}
+
+
+def timer(method):
+    """
+    A timer decorator to measure execution performance of methods.
+    """
+
+    def timed(*args, **kw):
+        start = time.time()
+        result = method(*args, **kw)
+        end = time.time()
+        tot_time = end - start
+        hours = tot_time // 3600
+        mins = tot_time // 60 % 60
+        # in Python 2.x round() will return a float, so we convert it to int
+        secs = int(round(tot_time % 60))
+        if hours == 0 and mins == 0 and secs < 10:
+            print(f"[TIMER] {method.__name__}(): {tot_time:.3f} seconds")
+        else:
+            print(f"[TIMER] {method.__name__}(): {hours}h {mins}m {secs}s")
+        return result
+
+    return timed
+
+
+# ////////////////////////////////////////////////////////////
+# { Feature extractor functions
+# ////////////////////////////////////////////////////////////
+"""
+Feature extractor functions are declared outside the SentimentAnalyzer class
+so that users can define their own feature extractors without modifying
+SentimentAnalyzer.
+"""
+
+
+def extract_unigram_feats(document, unigrams, handle_negation=False):
+    """
+    Populate a dictionary of unigram features, reflecting the presence/absence in
+    the document of each of the tokens in `unigrams`.
+
+    :param document: a list of words/tokens.
+    :param unigrams: a list of words/tokens whose presence/absence has to be
+        checked in `document`.
+    :param handle_negation: if `handle_negation == True`, apply `mark_negation`
+        to `document` before checking for unigram presence/absence.
+    :return: a dictionary of unigram features {unigram : boolean}.
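+
+    For example, with `handle_negation=True` the document is first passed
+    through `mark_negation` (defined below), so a feature such as
+    `contains(like_NEG)` can fire. An illustrative sketch, not a doctest::
+
+        extract_unigram_feats("I didn't like it .".split(),
+                              ['like_NEG'], handle_negation=True)
+        # -> {'contains(like_NEG)': True}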
+
+    >>> words = ['ice', 'police', 'riot']
+    >>> document = 'ice is melting due to global warming'.split()
+    >>> sorted(extract_unigram_feats(document, words).items())
+    [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)]
+    """
+    features = {}
+    if handle_negation:
+        document = mark_negation(document)
+    for word in unigrams:
+        features[f"contains({word})"] = word in set(document)
+    return features
+
+
+def extract_bigram_feats(document, bigrams):
+    """
+    Populate a dictionary of bigram features, reflecting the presence/absence in
+    the document of each of the tokens in `bigrams`. This extractor function only
+    considers contiguous bigrams obtained by `nltk.bigrams`.
+
+    :param document: a list of words/tokens.
+    :param bigrams: a list of bigrams whose presence/absence has to be
+        checked in `document`.
+    :return: a dictionary of bigram features {bigram : boolean}.
+
+    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
+    >>> document = 'ice is melting due to global warming'.split()
+    >>> sorted(extract_bigram_feats(document, bigrams).items()) # doctest: +NORMALIZE_WHITESPACE
+    [('contains(global - warming)', True), ('contains(love - you)', False),
+    ('contains(police - prevented)', False)]
+    """
+    features = {}
+    for bigr in bigrams:
+        features[f"contains({bigr[0]} - {bigr[1]})"] = bigr in nltk.bigrams(document)
+    return features
+
+
+# ////////////////////////////////////////////////////////////
+# { Helper Functions
+# ////////////////////////////////////////////////////////////
+
+
+def mark_negation(document, double_neg_flip=False, shallow=False):
+    """
+    Append _NEG suffix to words that appear in the scope between a negation
+    and a punctuation mark.
+
+    :param document: a list of words/tokens, or a tuple (words, label).
+    :param shallow: if True, the method will modify the original document in place.
+    :param double_neg_flip: if True, double negation is considered affirmation
+        (we activate/deactivate negation scope every time we find a negation).
+    :return: if `shallow == True` the method will modify the original document
+        and return it. If `shallow == False` the method will return a modified
+        document, leaving the original unmodified.
+
+    >>> sent = "I didn't like this movie . It was bad .".split()
+    >>> mark_negation(sent)
+    ['I', "didn't", 'like_NEG', 'this_NEG', 'movie_NEG', '.', 'It', 'was', 'bad', '.']
+    """
+    if not shallow:
+        document = deepcopy(document)
+    # check if the document is labeled. If so, do not consider the label.
+    labeled = document and isinstance(document[0], (tuple, list))
+    if labeled:
+        doc = document[0]
+    else:
+        doc = document
+    neg_scope = False
+    for i, word in enumerate(doc):
+        if NEGATION_RE.search(word):
+            if not neg_scope or (neg_scope and double_neg_flip):
+                neg_scope = not neg_scope
+                continue
+            else:
+                doc[i] += "_NEG"
+        elif neg_scope and CLAUSE_PUNCT_RE.search(word):
+            neg_scope = not neg_scope
+        elif neg_scope and not CLAUSE_PUNCT_RE.search(word):
+            doc[i] += "_NEG"
+
+    return document
+
+
+def output_markdown(filename, **kwargs):
+    """
+    Write the output of an analysis to a file.
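+
+    Entries are appended in a simple Markdown layout, e.g. (illustrative
+    values, as produced by the demos below)::
+
+        ***
+
+        07/05/2022, 12:00
+         - **Accuracy:** 0.75
+         - **Classifier:** NaiveBayesClassifier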
+ """ + with codecs.open(filename, "at") as outfile: + text = "\n*** \n\n" + text += "{} \n\n".format(time.strftime("%d/%m/%Y, %H:%M")) + for k in sorted(kwargs): + if isinstance(kwargs[k], dict): + dictionary = kwargs[k] + text += f" - **{k}:**\n" + for entry in sorted(dictionary): + text += f" - {entry}: {dictionary[entry]} \n" + elif isinstance(kwargs[k], list): + text += f" - **{k}:**\n" + for entry in kwargs[k]: + text += f" - {entry}\n" + else: + text += f" - **{k}:** {kwargs[k]} \n" + outfile.write(text) + + +def split_train_test(all_instances, n=None): + """ + Randomly split `n` instances of the dataset into train and test sets. + + :param all_instances: a list of instances (e.g. documents) that will be split. + :param n: the number of instances to consider (in case we want to use only a + subset). + :return: two lists of instances. Train set is 8/10 of the total and test set + is 2/10 of the total. + """ + random.seed(12345) + random.shuffle(all_instances) + if not n or n > len(all_instances): + n = len(all_instances) + train_set = all_instances[: int(0.8 * n)] + test_set = all_instances[int(0.8 * n) : n] + + return train_set, test_set + + +def _show_plot(x_values, y_values, x_labels=None, y_labels=None): + try: + import matplotlib.pyplot as plt + except ImportError as e: + raise ImportError( + "The plot function requires matplotlib to be installed." + "See https://matplotlib.org/" + ) from e + + plt.locator_params(axis="y", nbins=3) + axes = plt.axes() + axes.yaxis.grid() + plt.plot(x_values, y_values, "ro", color="red") + plt.ylim(ymin=-1.2, ymax=1.2) + plt.tight_layout(pad=5) + if x_labels: + plt.xticks(x_values, x_labels, rotation="vertical") + if y_labels: + plt.yticks([-1, 0, 1], y_labels, rotation="horizontal") + # Pad margins so that markers are not clipped by the axes + plt.margins(0.2) + plt.show() + + +# //////////////////////////////////////////////////////////// +# { Parsing and conversion functions +# //////////////////////////////////////////////////////////// + + +def json2csv_preprocess( + json_file, + outfile, + fields, + encoding="utf8", + errors="replace", + gzip_compress=False, + skip_retweets=True, + skip_tongue_tweets=True, + skip_ambiguous_tweets=True, + strip_off_emoticons=True, + remove_duplicates=True, + limit=None, +): + """ + Convert json file to csv file, preprocessing each row to obtain a suitable + dataset for tweets Semantic Analysis. + + :param json_file: the original json file containing tweets. + :param outfile: the output csv filename. + :param fields: a list of fields that will be extracted from the json file and + kept in the output csv file. + :param encoding: the encoding of the files. + :param errors: the error handling strategy for the output writer. + :param gzip_compress: if True, create a compressed GZIP file. + + :param skip_retweets: if True, remove retweets. + :param skip_tongue_tweets: if True, remove tweets containing ":P" and ":-P" + emoticons. + :param skip_ambiguous_tweets: if True, remove tweets containing both happy + and sad emoticons. + :param strip_off_emoticons: if True, strip off emoticons from all tweets. + :param remove_duplicates: if True, remove tweets appearing more than once. + :param limit: an integer to set the number of tweets to convert. After the + limit is reached the conversion will stop. It can be useful to create + subsets of the original tweets json data. 
+ """ + with codecs.open(json_file, encoding=encoding) as fp: + (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress) + # write the list of fields as header + writer.writerow(fields) + + if remove_duplicates == True: + tweets_cache = [] + i = 0 + for line in fp: + tweet = json.loads(line) + row = extract_fields(tweet, fields) + try: + text = row[fields.index("text")] + # Remove retweets + if skip_retweets == True: + if re.search(r"\bRT\b", text): + continue + # Remove tweets containing ":P" and ":-P" emoticons + if skip_tongue_tweets == True: + if re.search(r"\:\-?P\b", text): + continue + # Remove tweets containing both happy and sad emoticons + if skip_ambiguous_tweets == True: + all_emoticons = EMOTICON_RE.findall(text) + if all_emoticons: + if (set(all_emoticons) & HAPPY) and (set(all_emoticons) & SAD): + continue + # Strip off emoticons from all tweets + if strip_off_emoticons == True: + row[fields.index("text")] = re.sub( + r"(?!\n)\s+", " ", EMOTICON_RE.sub("", text) + ) + # Remove duplicate tweets + if remove_duplicates == True: + if row[fields.index("text")] in tweets_cache: + continue + else: + tweets_cache.append(row[fields.index("text")]) + except ValueError: + pass + writer.writerow(row) + i += 1 + if limit and i >= limit: + break + outf.close() + + +def parse_tweets_set( + filename, label, word_tokenizer=None, sent_tokenizer=None, skip_header=True +): + """ + Parse csv file containing tweets and output data a list of (text, label) tuples. + + :param filename: the input csv filename. + :param label: the label to be appended to each tweet contained in the csv file. + :param word_tokenizer: the tokenizer instance that will be used to tokenize + each sentence into tokens (e.g. WordPunctTokenizer() or BlanklineTokenizer()). + If no word_tokenizer is specified, tweets will not be tokenized. + :param sent_tokenizer: the tokenizer that will be used to split each tweet into + sentences. + :param skip_header: if True, skip the first line of the csv file (which usually + contains headers). + + :return: a list of (text, label) tuples. + """ + tweets = [] + if not sent_tokenizer: + sent_tokenizer = load("tokenizers/punkt/english.pickle") + + with codecs.open(filename, "rt") as csvfile: + reader = csv.reader(csvfile) + if skip_header == True: + next(reader, None) # skip the header + i = 0 + for tweet_id, text in reader: + # text = text[1] + i += 1 + sys.stdout.write(f"Loaded {i} tweets\r") + # Apply sentence and word tokenizer to text + if word_tokenizer: + tweet = [ + w + for sent in sent_tokenizer.tokenize(text) + for w in word_tokenizer.tokenize(sent) + ] + else: + tweet = text + tweets.append((tweet, label)) + + print(f"Loaded {i} tweets") + return tweets + + +# //////////////////////////////////////////////////////////// +# { Demos +# //////////////////////////////////////////////////////////// + + +def demo_tweets(trainer, n_instances=None, output=None): + """ + Train and test Naive Bayes classifier on 10000 tweets, tokenized using + TweetTokenizer. + Features are composed of: + + - 1000 most frequent unigrams + - 100 top bigrams (using BigramAssocMeasures.pmi) + + :param trainer: `train` method of a classifier. + :param n_instances: the number of total tweets that have to be used for + training and testing. Tweets will be equally split between positive and + negative. + :param output: the output file where results have to be reported. 
+ """ + from nltk.corpus import stopwords, twitter_samples + from nltk.sentiment import SentimentAnalyzer + from nltk.tokenize import TweetTokenizer + + # Different customizations for the TweetTokenizer + tokenizer = TweetTokenizer(preserve_case=False) + # tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True) + # tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True) + + if n_instances is not None: + n_instances = int(n_instances / 2) + + fields = ["id", "text"] + positive_json = twitter_samples.abspath("positive_tweets.json") + positive_csv = "positive_tweets.csv" + json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances) + + negative_json = twitter_samples.abspath("negative_tweets.json") + negative_csv = "negative_tweets.csv" + json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances) + + neg_docs = parse_tweets_set(negative_csv, label="neg", word_tokenizer=tokenizer) + pos_docs = parse_tweets_set(positive_csv, label="pos", word_tokenizer=tokenizer) + + # We separately split subjective and objective instances to keep a balanced + # uniform class distribution in both train and test sets. + train_pos_docs, test_pos_docs = split_train_test(pos_docs) + train_neg_docs, test_neg_docs = split_train_test(neg_docs) + + training_tweets = train_pos_docs + train_neg_docs + testing_tweets = test_pos_docs + test_neg_docs + + sentim_analyzer = SentimentAnalyzer() + # stopwords = stopwords.words('english') + # all_words = [word for word in sentim_analyzer.all_words(training_tweets) if word.lower() not in stopwords] + all_words = [word for word in sentim_analyzer.all_words(training_tweets)] + + # Add simple unigram word features + unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000) + sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) + + # Add bigram collocation features + bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats( + [tweet[0] for tweet in training_tweets], top_n=100, min_freq=12 + ) + sentim_analyzer.add_feat_extractor( + extract_bigram_feats, bigrams=bigram_collocs_feats + ) + + training_set = sentim_analyzer.apply_features(training_tweets) + test_set = sentim_analyzer.apply_features(testing_tweets) + + classifier = sentim_analyzer.train(trainer, training_set) + # classifier = sentim_analyzer.train(trainer, training_set, max_iter=4) + try: + classifier.show_most_informative_features() + except AttributeError: + print( + "Your classifier does not provide a show_most_informative_features() method." + ) + results = sentim_analyzer.evaluate(test_set) + + if output: + extr = [f.__name__ for f in sentim_analyzer.feat_extractors] + output_markdown( + output, + Dataset="labeled_tweets", + Classifier=type(classifier).__name__, + Tokenizer=tokenizer.__class__.__name__, + Feats=extr, + Results=results, + Instances=n_instances, + ) + + +def demo_movie_reviews(trainer, n_instances=None, output=None): + """ + Train classifier on all instances of the Movie Reviews dataset. + The corpus has been preprocessed using the default sentence tokenizer and + WordPunctTokenizer. + Features are composed of: + + - most frequent unigrams + + :param trainer: `train` method of a classifier. + :param n_instances: the number of total reviews that have to be used for + training and testing. Reviews will be equally split between positive and + negative. + :param output: the output file where results have to be reported. 
+ """ + from nltk.corpus import movie_reviews + from nltk.sentiment import SentimentAnalyzer + + if n_instances is not None: + n_instances = int(n_instances / 2) + + pos_docs = [ + (list(movie_reviews.words(pos_id)), "pos") + for pos_id in movie_reviews.fileids("pos")[:n_instances] + ] + neg_docs = [ + (list(movie_reviews.words(neg_id)), "neg") + for neg_id in movie_reviews.fileids("neg")[:n_instances] + ] + # We separately split positive and negative instances to keep a balanced + # uniform class distribution in both train and test sets. + train_pos_docs, test_pos_docs = split_train_test(pos_docs) + train_neg_docs, test_neg_docs = split_train_test(neg_docs) + + training_docs = train_pos_docs + train_neg_docs + testing_docs = test_pos_docs + test_neg_docs + + sentim_analyzer = SentimentAnalyzer() + all_words = sentim_analyzer.all_words(training_docs) + + # Add simple unigram word features + unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4) + sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) + # Apply features to obtain a feature-value representation of our datasets + training_set = sentim_analyzer.apply_features(training_docs) + test_set = sentim_analyzer.apply_features(testing_docs) + + classifier = sentim_analyzer.train(trainer, training_set) + try: + classifier.show_most_informative_features() + except AttributeError: + print( + "Your classifier does not provide a show_most_informative_features() method." + ) + results = sentim_analyzer.evaluate(test_set) + + if output: + extr = [f.__name__ for f in sentim_analyzer.feat_extractors] + output_markdown( + output, + Dataset="Movie_reviews", + Classifier=type(classifier).__name__, + Tokenizer="WordPunctTokenizer", + Feats=extr, + Results=results, + Instances=n_instances, + ) + + +def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None): + """ + Train and test a classifier on instances of the Subjective Dataset by Pang and + Lee. The dataset is made of 5000 subjective and 5000 objective sentences. + All tokens (words and punctuation marks) are separated by a whitespace, so + we use the basic WhitespaceTokenizer to parse the data. + + :param trainer: `train` method of a classifier. + :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file. + :param n_instances: the number of total sentences that have to be used for + training and testing. Sentences will be equally split between positive + and negative. + :param output: the output file where results have to be reported. + """ + from nltk.corpus import subjectivity + from nltk.sentiment import SentimentAnalyzer + + if n_instances is not None: + n_instances = int(n_instances / 2) + + subj_docs = [ + (sent, "subj") for sent in subjectivity.sents(categories="subj")[:n_instances] + ] + obj_docs = [ + (sent, "obj") for sent in subjectivity.sents(categories="obj")[:n_instances] + ] + + # We separately split subjective and objective instances to keep a balanced + # uniform class distribution in both train and test sets. 
+ train_subj_docs, test_subj_docs = split_train_test(subj_docs) + train_obj_docs, test_obj_docs = split_train_test(obj_docs) + + training_docs = train_subj_docs + train_obj_docs + testing_docs = test_subj_docs + test_obj_docs + + sentim_analyzer = SentimentAnalyzer() + all_words_neg = sentim_analyzer.all_words( + [mark_negation(doc) for doc in training_docs] + ) + + # Add simple unigram word features handling negation + unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4) + sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) + + # Apply features to obtain a feature-value representation of our datasets + training_set = sentim_analyzer.apply_features(training_docs) + test_set = sentim_analyzer.apply_features(testing_docs) + + classifier = sentim_analyzer.train(trainer, training_set) + try: + classifier.show_most_informative_features() + except AttributeError: + print( + "Your classifier does not provide a show_most_informative_features() method." + ) + results = sentim_analyzer.evaluate(test_set) + + if save_analyzer == True: + sentim_analyzer.save_file(sentim_analyzer, "sa_subjectivity.pickle") + + if output: + extr = [f.__name__ for f in sentim_analyzer.feat_extractors] + output_markdown( + output, + Dataset="subjectivity", + Classifier=type(classifier).__name__, + Tokenizer="WhitespaceTokenizer", + Feats=extr, + Instances=n_instances, + Results=results, + ) + + return sentim_analyzer + + +def demo_sent_subjectivity(text): + """ + Classify a single sentence as subjective or objective using a stored + SentimentAnalyzer. + + :param text: a sentence whose subjectivity has to be classified. + """ + from nltk.classify import NaiveBayesClassifier + from nltk.tokenize import regexp + + word_tokenizer = regexp.WhitespaceTokenizer() + try: + sentim_analyzer = load("sa_subjectivity.pickle") + except LookupError: + print("Cannot find the sentiment analyzer you want to load.") + print("Training a new one using NaiveBayesClassifier.") + sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True) + + # Tokenize and convert to lower case + tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)] + print(sentim_analyzer.classify(tokenized_text)) + + +def demo_liu_hu_lexicon(sentence, plot=False): + """ + Basic example of sentiment classification using Liu and Hu opinion lexicon. + This function simply counts the number of positive, negative and neutral words + in the sentence and classifies it depending on which polarity is more represented. + Words that do not appear in the lexicon are considered as neutral. + + :param sentence: a sentence whose polarity has to be classified. + :param plot: if True, plot a visual representation of the sentence polarity. 
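+
+    For example, as in the commented-out call in `__main__` (requires the
+    `opinion_lexicon` corpus to be downloaded)::
+
+        demo_liu_hu_lexicon("This movie was actually neither that funny, "
+                            "nor super witty.", plot=True)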
+ """ + from nltk.corpus import opinion_lexicon + from nltk.tokenize import treebank + + tokenizer = treebank.TreebankWordTokenizer() + pos_words = 0 + neg_words = 0 + tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)] + + x = list(range(len(tokenized_sent))) # x axis for the plot + y = [] + + for word in tokenized_sent: + if word in opinion_lexicon.positive(): + pos_words += 1 + y.append(1) # positive + elif word in opinion_lexicon.negative(): + neg_words += 1 + y.append(-1) # negative + else: + y.append(0) # neutral + + if pos_words > neg_words: + print("Positive") + elif pos_words < neg_words: + print("Negative") + elif pos_words == neg_words: + print("Neutral") + + if plot == True: + _show_plot( + x, y, x_labels=tokenized_sent, y_labels=["Negative", "Neutral", "Positive"] + ) + + +def demo_vader_instance(text): + """ + Output polarity scores for a text using Vader approach. + + :param text: a text whose polarity has to be evaluated. + """ + from nltk.sentiment import SentimentIntensityAnalyzer + + vader_analyzer = SentimentIntensityAnalyzer() + print(vader_analyzer.polarity_scores(text)) + + +def demo_vader_tweets(n_instances=None, output=None): + """ + Classify 10000 positive and negative tweets using Vader approach. + + :param n_instances: the number of total tweets that have to be classified. + :param output: the output file where results have to be reported. + """ + from collections import defaultdict + + from nltk.corpus import twitter_samples + from nltk.metrics import accuracy as eval_accuracy + from nltk.metrics import f_measure as eval_f_measure + from nltk.metrics import precision as eval_precision + from nltk.metrics import recall as eval_recall + from nltk.sentiment import SentimentIntensityAnalyzer + + if n_instances is not None: + n_instances = int(n_instances / 2) + + fields = ["id", "text"] + positive_json = twitter_samples.abspath("positive_tweets.json") + positive_csv = "positive_tweets.csv" + json2csv_preprocess( + positive_json, + positive_csv, + fields, + strip_off_emoticons=False, + limit=n_instances, + ) + + negative_json = twitter_samples.abspath("negative_tweets.json") + negative_csv = "negative_tweets.csv" + json2csv_preprocess( + negative_json, + negative_csv, + fields, + strip_off_emoticons=False, + limit=n_instances, + ) + + pos_docs = parse_tweets_set(positive_csv, label="pos") + neg_docs = parse_tweets_set(negative_csv, label="neg") + + # We separately split subjective and objective instances to keep a balanced + # uniform class distribution in both train and test sets. 
+    train_pos_docs, test_pos_docs = split_train_test(pos_docs)
+    train_neg_docs, test_neg_docs = split_train_test(neg_docs)
+
+    training_tweets = train_pos_docs + train_neg_docs
+    testing_tweets = test_pos_docs + test_neg_docs
+
+    vader_analyzer = SentimentIntensityAnalyzer()
+
+    gold_results = defaultdict(set)
+    test_results = defaultdict(set)
+    acc_gold_results = []
+    acc_test_results = []
+    labels = set()
+    for i, (text, label) in enumerate(testing_tweets):
+        labels.add(label)
+        gold_results[label].add(i)
+        acc_gold_results.append(label)
+        score = vader_analyzer.polarity_scores(text)["compound"]
+        if score > 0:
+            observed = "pos"
+        else:
+            observed = "neg"
+        acc_test_results.append(observed)
+        test_results[observed].add(i)
+
+    metrics_results = {}
+    # Accuracy is computed once over all instances; the remaining metrics
+    # are computed per label from the index sets collected above.
+    metrics_results["Accuracy"] = eval_accuracy(acc_gold_results, acc_test_results)
+    for label in labels:
+        precision_score = eval_precision(gold_results[label], test_results[label])
+        metrics_results[f"Precision [{label}]"] = precision_score
+        recall_score = eval_recall(gold_results[label], test_results[label])
+        metrics_results[f"Recall [{label}]"] = recall_score
+        f_measure_score = eval_f_measure(gold_results[label], test_results[label])
+        metrics_results[f"F-measure [{label}]"] = f_measure_score
+
+    for result in sorted(metrics_results):
+        print(f"{result}: {metrics_results[result]}")
+
+    if output:
+        output_markdown(
+            output,
+            Approach="Vader",
+            Dataset="labeled_tweets",
+            Instances=n_instances,
+            Results=metrics_results,
+        )
+
+
+if __name__ == "__main__":
+    from sklearn.svm import LinearSVC
+
+    from nltk.classify import MaxentClassifier, NaiveBayesClassifier
+    from nltk.classify.scikitlearn import SklearnClassifier
+    from nltk.twitter.common import _outf_writer, extract_fields
+
+    naive_bayes = NaiveBayesClassifier.train
+    svm = SklearnClassifier(LinearSVC()).train
+    maxent = MaxentClassifier.train
+
+    demo_tweets(naive_bayes)
+    # demo_movie_reviews(svm)
+    # demo_subjectivity(svm)
+    # demo_sent_subjectivity("she's an artist , but hasn't picked up a brush in a year . ")
+    # demo_liu_hu_lexicon("This movie was actually neither that funny, nor super witty.", plot=True)
+    # demo_vader_instance("This movie was actually neither that funny, nor super witty.")
+    # demo_vader_tweets()
diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/tag/__init__.py b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a01c20d4732cd9f7f2e0c92759b96a0e0ba07d6
--- /dev/null
+++ b/.eggs/nltk-3.8-py3.10.egg/nltk/tag/__init__.py
@@ -0,0 +1,184 @@
+# Natural Language Toolkit: Taggers
+#
+# Copyright (C) 2001-2022 NLTK Project
+# Author: Edward Loper
+#         Steven Bird (minor additions)
+# URL: <https://www.nltk.org/>
+# For license information, see LICENSE.TXT
+"""
+NLTK Taggers
+
+This package contains classes and interfaces for part-of-speech
+tagging, or simply "tagging".
+
+A "tag" is a case-sensitive string that specifies some property of a token,
+such as its part of speech. Tagged tokens are encoded as tuples
+``(token, tag)``. For example, the following tagged token combines
+the word ``'fly'`` with a noun part of speech tag (``'NN'``):
+
+    >>> tagged_tok = ('fly', 'NN')
+
+An off-the-shelf tagger is available for English.
It uses the Penn Treebank tagset: + + >>> from nltk import pos_tag, word_tokenize + >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +NORMALIZE_WHITESPACE + [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'), + ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')] + +A Russian tagger is also available if you specify lang="rus". It uses +the Russian National Corpus tagset: + + >>> pos_tag(word_tokenize("Илья оторопел и дважды перечитал бумажку."), lang='rus') # doctest: +SKIP + [('Илья', 'S'), ('оторопел', 'V'), ('и', 'CONJ'), ('дважды', 'ADV'), ('перечитал', 'V'), + ('бумажку', 'S'), ('.', 'NONLEX')] + +This package defines several taggers, which take a list of tokens, +assign a tag to each one, and return the resulting list of tagged tokens. +Most of the taggers are built automatically based on a training corpus. +For example, the unigram tagger tags each word *w* by checking what +the most frequent tag for *w* was in a training corpus: + + >>> from nltk.corpus import brown + >>> from nltk.tag import UnigramTagger + >>> tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500]) + >>> sent = ['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment'] + >>> for word, tag in tagger.tag(sent): + ... print(word, '->', tag) + Mitchell -> NP + decried -> None + the -> AT + high -> JJ + rate -> NN + of -> IN + unemployment -> None + +Note that words that the tagger has not seen during training receive a tag +of ``None``. + +We evaluate a tagger on data that was not seen during training: + + >>> round(tagger.accuracy(brown.tagged_sents(categories='news')[500:600]), 3) + 0.735 + +For more information, please consult chapter 5 of the NLTK Book. + +isort:skip_file +""" + +from nltk.tag.api import TaggerI +from nltk.tag.util import str2tuple, tuple2str, untag +from nltk.tag.sequential import ( + SequentialBackoffTagger, + ContextTagger, + DefaultTagger, + NgramTagger, + UnigramTagger, + BigramTagger, + TrigramTagger, + AffixTagger, + RegexpTagger, + ClassifierBasedTagger, + ClassifierBasedPOSTagger, +) +from nltk.tag.brill import BrillTagger +from nltk.tag.brill_trainer import BrillTaggerTrainer +from nltk.tag.tnt import TnT +from nltk.tag.hunpos import HunposTagger +from nltk.tag.stanford import StanfordTagger, StanfordPOSTagger, StanfordNERTagger +from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer +from nltk.tag.senna import SennaTagger, SennaChunkTagger, SennaNERTagger +from nltk.tag.mapping import tagset_mapping, map_tag +from nltk.tag.crf import CRFTagger +from nltk.tag.perceptron import PerceptronTagger + +from nltk.data import load, find + +RUS_PICKLE = ( + "taggers/averaged_perceptron_tagger_ru/averaged_perceptron_tagger_ru.pickle" +) + + +def _get_tagger(lang=None): + if lang == "rus": + tagger = PerceptronTagger(False) + ap_russian_model_loc = "file:" + str(find(RUS_PICKLE)) + tagger.load(ap_russian_model_loc) + else: + tagger = PerceptronTagger() + return tagger + + +def _pos_tag(tokens, tagset=None, tagger=None, lang=None): + # Currently only supports English and Russian. + if lang not in ["eng", "rus"]: + raise NotImplementedError( + "Currently, NLTK pos_tag only supports English and Russian " + "(i.e. lang='eng' or lang='rus')" + ) + # Throws Error if tokens is of string type + elif isinstance(tokens, str): + raise TypeError("tokens: expected a list of strings, got a string") + + else: + tagged_tokens = tagger.tag(tokens) + if tagset: # Maps to the specified tagset. 
+ if lang == "eng": + tagged_tokens = [ + (token, map_tag("en-ptb", tagset, tag)) + for (token, tag) in tagged_tokens + ] + elif lang == "rus": + # Note that the new Russian pos tags from the model contains suffixes, + # see https://github.com/nltk/nltk/issues/2151#issuecomment-430709018 + tagged_tokens = [ + (token, map_tag("ru-rnc-new", tagset, tag.partition("=")[0])) + for (token, tag) in tagged_tokens + ] + return tagged_tokens + + +def pos_tag(tokens, tagset=None, lang="eng"): + """ + Use NLTK's currently recommended part of speech tagger to + tag the given list of tokens. + + >>> from nltk.tag import pos_tag + >>> from nltk.tokenize import word_tokenize + >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +NORMALIZE_WHITESPACE + [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'), + ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')] + >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal') # doctest: +NORMALIZE_WHITESPACE + [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'), + ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')] + + NB. Use `pos_tag_sents()` for efficient tagging of more than one sentence. + + :param tokens: Sequence of tokens to be tagged + :type tokens: list(str) + :param tagset: the tagset to be used, e.g. universal, wsj, brown + :type tagset: str + :param lang: the ISO 639 code of the language, e.g. 'eng' for English, 'rus' for Russian + :type lang: str + :return: The tagged tokens + :rtype: list(tuple(str, str)) + """ + tagger = _get_tagger(lang) + return _pos_tag(tokens, tagset, tagger, lang) + + +def pos_tag_sents(sentences, tagset=None, lang="eng"): + """ + Use NLTK's currently recommended part of speech tagger to tag the + given list of sentences, each consisting of a list of tokens. + + :param sentences: List of sentences to be tagged + :type sentences: list(list(str)) + :param tagset: the tagset to be used, e.g. universal, wsj, brown + :type tagset: str + :param lang: the ISO 639 code of the language, e.g. 
'eng' for English, 'rus' for Russian + :type lang: str + :return: The list of tagged sentences + :rtype: list(list(tuple(str, str))) + """ + tagger = _get_tagger(lang) + return [_pos_tag(sent, tagset, tagger, lang) for sent in sentences] diff --git a/.eggs/nltk-3.8-py3.10.egg/nltk/util.py b/.eggs/nltk-3.8-py3.10.egg/nltk/util.py new file mode 100644 index 0000000000000000000000000000000000000000..ba6812ac3f4ea4a82c1321de59e3a94f2f1119fa --- /dev/null +++ b/.eggs/nltk-3.8-py3.10.egg/nltk/util.py @@ -0,0 +1,1216 @@ +# Natural Language Toolkit: Utility functions +# +# Copyright (C) 2001-2022 NLTK Project +# Author: Steven Bird +# Eric Kafe (acyclic closures) +# URL: +# For license information, see LICENSE.TXT + +import inspect +import locale +import os +import pydoc +import re +import textwrap +import warnings +from collections import defaultdict, deque +from itertools import chain, combinations, islice, tee +from pprint import pprint +from urllib.request import ( + HTTPPasswordMgrWithDefaultRealm, + ProxyBasicAuthHandler, + ProxyDigestAuthHandler, + ProxyHandler, + build_opener, + getproxies, + install_opener, +) + +from nltk.collections import * +from nltk.internals import deprecated, raise_unorderable_types, slice_bounds + +###################################################################### +# Short usage message +###################################################################### + + +@deprecated("Use help(obj) instead.") +def usage(obj): + str(obj) # In case it's lazy, this will load it. + + if not isinstance(obj, type): + obj = obj.__class__ + + print(f"{obj.__name__} supports the following operations:") + for (name, method) in sorted(pydoc.allmethods(obj).items()): + if name.startswith("_"): + continue + if getattr(method, "__deprecated__", False): + continue + + try: + sig = str(inspect.signature(method)) + except ValueError as e: + # builtins sometimes don't support introspection + if "builtin" in str(e): + continue + else: + raise + + args = sig.lstrip("(").rstrip(")").split(", ") + meth = inspect.getattr_static(obj, name) + if isinstance(meth, (classmethod, staticmethod)): + name = f"cls.{name}" + elif args and args[0] == "self": + name = f"self.{name}" + args.pop(0) + print( + textwrap.fill( + f"{name}({', '.join(args)})", + initial_indent=" - ", + subsequent_indent=" " * (len(name) + 5), + ) + ) + + +########################################################################## +# IDLE +########################################################################## + + +def in_idle(): + """ + Return True if this function is run within idle. Tkinter + programs that are run in idle should never call ``Tk.mainloop``; so + this function should be used to gate all calls to ``Tk.mainloop``. + + :warning: This function works by checking ``sys.stdin``. If the + user has modified ``sys.stdin``, then it may return incorrect + results. 
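The usual gating pattern is sketched below; a hypothetical snippet that needs a display to actually run:

import tkinter

from nltk.util import in_idle

root = tkinter.Tk()
# ... build the widgets here ...
if not in_idle():  # IDLE already runs a Tk mainloop; starting another would hang
    root.mainloop()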
+ :rtype: bool + """ + import sys + + return sys.stdin.__class__.__name__ in ("PyShell", "RPCProxy") + + +########################################################################## +# PRETTY PRINTING +########################################################################## + + +def pr(data, start=0, end=None): + """ + Pretty print a sequence of data items + + :param data: the data stream to print + :type data: sequence or iter + :param start: the start position + :type start: int + :param end: the end position + :type end: int + """ + pprint(list(islice(data, start, end))) + + +def print_string(s, width=70): + """ + Pretty print a string, breaking lines on whitespace + + :param s: the string to print, consisting of words and spaces + :type s: str + :param width: the display width + :type width: int + """ + print("\n".join(textwrap.wrap(s, width=width))) + + +def tokenwrap(tokens, separator=" ", width=70): + """ + Pretty print a list of text tokens, breaking lines on whitespace + + :param tokens: the tokens to print + :type tokens: list + :param separator: the string to use to separate tokens + :type separator: str + :param width: the display width (default=70) + :type width: int + """ + return "\n".join(textwrap.wrap(separator.join(tokens), width=width)) + + +########################################################################## +# Indexing +########################################################################## + + +class Index(defaultdict): + def __init__(self, pairs): + defaultdict.__init__(self, list) + for key, value in pairs: + self[key].append(value) + + +###################################################################### +## Regexp display (thanks to David Mertz) +###################################################################### + + +def re_show(regexp, string, left="{", right="}"): + """ + Return a string with markers surrounding the matched substrings. + Search str for substrings matching ``regexp`` and wrap the matches + with braces. This is convenient for learning about regular expressions. + + :param regexp: The regular expression. + :type regexp: str + :param string: The string being matched. + :type string: str + :param left: The left delimiter (printed before the matched substring) + :type left: str + :param right: The right delimiter (printed after the matched substring) + :type right: str + :rtype: str + """ + print(re.compile(regexp, re.M).sub(left + r"\g<0>" + right, string.rstrip())) + + +########################################################################## +# READ FROM FILE OR STRING +########################################################################## + +# recipe from David Mertz +def filestring(f): + if hasattr(f, "read"): + return f.read() + elif isinstance(f, str): + with open(f) as infile: + return infile.read() + else: + raise ValueError("Must be called with a filename or file-like object") + + +########################################################################## +# Breadth-First Search +########################################################################## + + +def breadth_first(tree, children=iter, maxdepth=-1): + """Traverse the nodes of a tree in breadth-first order. + (No check for cycles.) + The first argument should be the tree root; + children should be a function taking as argument a tree node + and returning an iterator of the node's children. 
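A self-contained sketch, encoding the tree as a dict (the node names are illustrative):

from nltk.util import breadth_first

tree = {"A": ["B", "C"], "B": ["D", "E"], "C": [], "D": [], "E": []}
print(list(breadth_first("A", children=lambda node: tree[node])))
# -> ['A', 'B', 'C', 'D', 'E']: siblings are visited before any grandchildren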
+ """ + queue = deque([(tree, 0)]) + + while queue: + node, depth = queue.popleft() + yield node + + if depth != maxdepth: + try: + queue.extend((c, depth + 1) for c in children(node)) + except TypeError: + pass + + +########################################################################## +# Graph Drawing +########################################################################## + + +def edge_closure(tree, children=iter, maxdepth=-1, verbose=False): + """Yield the edges of a graph in breadth-first order, + discarding eventual cycles. + The first argument should be the start node; + children should be a function taking as argument a graph node + and returning an iterator of the node's children. + + >>> from nltk.util import edge_closure + >>> print(list(edge_closure('A', lambda node:{'A':['B','C'], 'B':'C', 'C':'B'}[node]))) + [('A', 'B'), ('A', 'C'), ('B', 'C'), ('C', 'B')] + """ + traversed = set() + edges = set() + queue = deque([(tree, 0)]) + while queue: + node, depth = queue.popleft() + traversed.add(node) + if depth != maxdepth: + try: + for child in children(node): + if child not in traversed: + queue.append((child, depth + 1)) + else: + if verbose: + warnings.warn( + f"Discarded redundant search for {child} at depth {depth + 1}", + stacklevel=2, + ) + edge = (node, child) + if edge not in edges: + yield edge + edges.add(edge) + except TypeError: + pass + + +def edges2dot(edges, shapes=None, attr=None): + """ + :param edges: the set (or list) of edges of a directed graph. + + :return dot_string: a representation of 'edges' as a string in the DOT + graph language, which can be converted to an image by the 'dot' program + from the Graphviz package, or nltk.parse.dependencygraph.dot2img(dot_string). + + :param shapes: dictionary of strings that trigger a specified shape. + :param attr: dictionary with global graph attributes + + >>> import nltk + >>> from nltk.util import edges2dot + >>> print(edges2dot([('A', 'B'), ('A', 'C'), ('B', 'C'), ('C', 'B')])) + digraph G { + "A" -> "B"; + "A" -> "C"; + "B" -> "C"; + "C" -> "B"; + } + + """ + if not shapes: + shapes = dict() + if not attr: + attr = dict() + + dot_string = "digraph G {\n" + + for pair in attr.items(): + dot_string += f"{pair[0]} = {pair[1]};\n" + + for edge in edges: + for shape in shapes.items(): + for node in range(2): + if shape[0] in repr(edge[node]): + dot_string += f'"{edge[node]}" [shape = {shape[1]}];\n' + dot_string += f'"{edge[0]}" -> "{edge[1]}";\n' + + dot_string += "}\n" + return dot_string + + +def unweighted_minimum_spanning_digraph(tree, children=iter, shapes=None, attr=None): + """ + + Build a Minimum Spanning Tree (MST) of an unweighted graph, + by traversing the nodes of a tree in breadth-first order, + discarding eventual cycles. + + Return a representation of this MST as a string in the DOT graph language, + which can be converted to an image by the 'dot' program from the Graphviz + package, or nltk.parse.dependencygraph.dot2img(dot_string). + + The first argument should be the tree root; + children should be a function taking as argument a tree node + and returning an iterator of the node's children. 
+ + >>> import nltk + >>> wn=nltk.corpus.wordnet + >>> from nltk.util import unweighted_minimum_spanning_digraph as umsd + >>> print(umsd(wn.synset('bound.a.01'), lambda s:s.also_sees())) + digraph G { + "Synset('bound.a.01')" -> "Synset('unfree.a.02')"; + "Synset('unfree.a.02')" -> "Synset('confined.a.02')"; + "Synset('unfree.a.02')" -> "Synset('dependent.a.01')"; + "Synset('unfree.a.02')" -> "Synset('restricted.a.01')"; + "Synset('restricted.a.01')" -> "Synset('classified.a.02')"; + } + + """ + return edges2dot( + edge_closure( + tree, lambda node: unweighted_minimum_spanning_dict(tree, children)[node] + ), + shapes, + attr, + ) + + +########################################################################## +# Breadth-First / Depth-first Searches with Cycle Detection +########################################################################## + + +def acyclic_breadth_first(tree, children=iter, maxdepth=-1): + """Traverse the nodes of a tree in breadth-first order, + discarding eventual cycles. + + The first argument should be the tree root; + children should be a function taking as argument a tree node + and returning an iterator of the node's children. + """ + traversed = set() + queue = deque([(tree, 0)]) + while queue: + node, depth = queue.popleft() + yield node + traversed.add(node) + if depth != maxdepth: + try: + for child in children(node): + if child not in traversed: + queue.append((child, depth + 1)) + else: + warnings.warn( + "Discarded redundant search for {} at depth {}".format( + child, depth + 1 + ), + stacklevel=2, + ) + except TypeError: + pass + + +def acyclic_depth_first(tree, children=iter, depth=-1, cut_mark=None, traversed=None): + """Traverse the nodes of a tree in depth-first order, + discarding eventual cycles within any branch, + adding cut_mark (when specified) if cycles were truncated. + + The first argument should be the tree root; + children should be a function taking as argument a tree node + and returning an iterator of the node's children. 
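acyclic_breadth_first, defined just above, differs from breadth_first only in remembering visited nodes, so it terminates on cyclic input. A toy sketch (the discarded-edge warning goes to stderr):

from nltk.util import acyclic_breadth_first

graph = {"A": ["B"], "B": ["A", "C"], "C": []}  # B -> A closes a cycle
print(list(acyclic_breadth_first("A", lambda node: graph[node])))
# -> ['A', 'B', 'C']; the B -> A back-edge is discarded with a warning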
+ + Catches all cycles: + + >>> import nltk + >>> from nltk.util import acyclic_depth_first as acyclic_tree + >>> wn=nltk.corpus.wordnet + >>> from pprint import pprint + >>> pprint(acyclic_tree(wn.synset('dog.n.01'), lambda s:s.hypernyms(),cut_mark='...')) + [Synset('dog.n.01'), + [Synset('canine.n.02'), + [Synset('carnivore.n.01'), + [Synset('placental.n.01'), + [Synset('mammal.n.01'), + [Synset('vertebrate.n.01'), + [Synset('chordate.n.01'), + [Synset('animal.n.01'), + [Synset('organism.n.01'), + [Synset('living_thing.n.01'), + [Synset('whole.n.02'), + [Synset('object.n.01'), + [Synset('physical_entity.n.01'), + [Synset('entity.n.01')]]]]]]]]]]]]], + [Synset('domestic_animal.n.01'), "Cycle(Synset('animal.n.01'),-3,...)"]] + """ + if traversed is None: + traversed = {tree} + out_tree = [tree] + if depth != 0: + try: + for child in children(tree): + if child not in traversed: + # Recurse with a common "traversed" set for all children: + traversed.add(child) + out_tree += [ + acyclic_depth_first( + child, children, depth - 1, cut_mark, traversed + ) + ] + else: + warnings.warn( + "Discarded redundant search for {} at depth {}".format( + child, depth - 1 + ), + stacklevel=3, + ) + if cut_mark: + out_tree += [f"Cycle({child},{depth - 1},{cut_mark})"] + except TypeError: + pass + elif cut_mark: + out_tree += [cut_mark] + return out_tree + + +def acyclic_branches_depth_first( + tree, children=iter, depth=-1, cut_mark=None, traversed=None +): + """Traverse the nodes of a tree in depth-first order, + discarding eventual cycles within the same branch, + but keep duplicate paths in different branches. + Add cut_mark (when defined) if cycles were truncated. + + The first argument should be the tree root; + children should be a function taking as argument a tree node + and returning an iterator of the node's children. 
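The wordnet doctest for acyclic_depth_first above needs the corpus; the same cut-marking behavior can be checked on a toy cycle:

from nltk.util import acyclic_depth_first

graph = {"A": ["B"], "B": ["C"], "C": ["A"]}  # A -> B -> C -> A
print(acyclic_depth_first("A", lambda node: graph[node], cut_mark="..."))
# -> ['A', ['B', ['C', "Cycle(A,-4,...)"]]]
# (depth starts at -1 by default, so the cycle is cut at depth -4)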
+
+    Catches only cycles within the same branch, while keeping
+    duplicate paths that reach a node through different branches:
+
+    >>> import nltk
+    >>> from nltk.util import acyclic_branches_depth_first as tree
+    >>> wn=nltk.corpus.wordnet
+    >>> from pprint import pprint
+    >>> pprint(tree(wn.synset('certified.a.01'), lambda s:s.also_sees(), cut_mark='...', depth=4))
+    [Synset('certified.a.01'),
+     [Synset('authorized.a.01'),
+      [Synset('lawful.a.01'),
+       [Synset('legal.a.01'),
+        "Cycle(Synset('lawful.a.01'),0,...)",
+        [Synset('legitimate.a.01'), '...']],
+       [Synset('straight.a.06'),
+        [Synset('honest.a.01'), '...'],
+        "Cycle(Synset('lawful.a.01'),0,...)"]],
+      [Synset('legitimate.a.01'),
+       "Cycle(Synset('authorized.a.01'),1,...)",
+       [Synset('legal.a.01'),
+        [Synset('lawful.a.01'), '...'],
+        "Cycle(Synset('legitimate.a.01'),0,...)"],
+       [Synset('valid.a.01'),
+        "Cycle(Synset('legitimate.a.01'),0,...)",
+        [Synset('reasonable.a.01'), '...']]],
+      [Synset('official.a.01'), "Cycle(Synset('authorized.a.01'),1,...)"]],
+     [Synset('documented.a.01')]]
+    """
+    if traversed is None:
+        traversed = {tree}
+    out_tree = [tree]
+    if depth != 0:
+        try:
+            for child in children(tree):
+                if child not in traversed:
+                    # Recurse with a different "traversed" set for each child:
+                    out_tree += [
+                        acyclic_branches_depth_first(
+                            child,
+                            children,
+                            depth - 1,
+                            cut_mark,
+                            traversed.union({child}),
+                        )
+                    ]
+                else:
+                    warnings.warn(
+                        "Discarded redundant search for {} at depth {}".format(
+                            child, depth - 1
+                        ),
+                        stacklevel=3,
+                    )
+                    if cut_mark:
+                        out_tree += [f"Cycle({child},{depth - 1},{cut_mark})"]
+        except TypeError:
+            pass
+    elif cut_mark:
+        out_tree += [cut_mark]
+    return out_tree
+
+
+def acyclic_dic2tree(node, dic):
+    """Convert the acyclic dictionary 'dic', where the keys are nodes and the
+    values are lists of children, to an output tree suitable for pprint(),
+    starting at root 'node', with subtrees as nested lists."""
+    return [node] + [acyclic_dic2tree(child, dic) for child in dic[node]]
+
+
+def unweighted_minimum_spanning_dict(tree, children=iter):
+    """
+    Output a dictionary representing a Minimum Spanning Tree (MST)
+    of an unweighted graph, by traversing the nodes of a tree in
+    breadth-first order, discarding any cycles.
+
+    The first argument should be the tree root;
+    children should be a function taking as argument a tree node
+    and returning an iterator of the node's children.
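acyclic_dic2tree above is what later turns such an MST dictionary into the nested-list tree shape; a toy run:

from nltk.util import acyclic_dic2tree

dic = {"A": ["B", "C"], "B": [], "C": ["D"], "D": []}
print(acyclic_dic2tree("A", dic))
# -> ['A', ['B'], ['C', ['D']]]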
+ + >>> import nltk + >>> from nltk.corpus import wordnet as wn + >>> from nltk.util import unweighted_minimum_spanning_dict as umsd + >>> from pprint import pprint + >>> pprint(umsd(wn.synset('bound.a.01'), lambda s:s.also_sees())) + {Synset('bound.a.01'): [Synset('unfree.a.02')], + Synset('classified.a.02'): [], + Synset('confined.a.02'): [], + Synset('dependent.a.01'): [], + Synset('restricted.a.01'): [Synset('classified.a.02')], + Synset('unfree.a.02'): [Synset('confined.a.02'), + Synset('dependent.a.01'), + Synset('restricted.a.01')]} + + """ + traversed = set() # Empty set of traversed nodes + queue = deque([tree]) # Initialize queue + agenda = {tree} # Set of all nodes ever queued + mstdic = {} # Empty MST dictionary + while queue: + node = queue.popleft() # Node is not yet in the MST dictionary, + mstdic[node] = [] # so add it with an empty list of children + if node not in traversed: # Avoid cycles + traversed.add(node) + for child in children(node): + if child not in agenda: # Queue nodes only once + mstdic[node].append(child) # Add child to the MST + queue.append(child) # Add child to queue + agenda.add(child) + return mstdic + + +def unweighted_minimum_spanning_tree(tree, children=iter): + """ + Output a Minimum Spanning Tree (MST) of an unweighted graph, + by traversing the nodes of a tree in breadth-first order, + discarding eventual cycles. + + The first argument should be the tree root; + children should be a function taking as argument a tree node + and returning an iterator of the node's children. + + >>> import nltk + >>> from nltk.util import unweighted_minimum_spanning_tree as mst + >>> wn=nltk.corpus.wordnet + >>> from pprint import pprint + >>> pprint(mst(wn.synset('bound.a.01'), lambda s:s.also_sees())) + [Synset('bound.a.01'), + [Synset('unfree.a.02'), + [Synset('confined.a.02')], + [Synset('dependent.a.01')], + [Synset('restricted.a.01'), [Synset('classified.a.02')]]]] + """ + return acyclic_dic2tree(tree, unweighted_minimum_spanning_dict(tree, children)) + + +########################################################################## +# Guess Character Encoding +########################################################################## + +# adapted from io.py in the docutils extension module (https://docutils.sourceforge.io/) +# http://www.pyzine.com/Issue008/Section_Articles/article_Encodings.html + + +def guess_encoding(data): + """ + Given a byte string, attempt to decode it. + Tries the standard 'UTF8' and 'latin-1' encodings, + Plus several gathered from locale information. + + The calling program *must* first call:: + + locale.setlocale(locale.LC_ALL, '') + + If successful it returns ``(decoded_unicode, successful_encoding)``. + If unsuccessful it raises a ``UnicodeError``. 
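A minimal sketch of the calling sequence; which encoding ends up being reported depends on the active locale:

import locale

from nltk.util import guess_encoding

locale.setlocale(locale.LC_ALL, "")   # must precede guess_encoding
data = "déjà vu".encode("latin-1")    # bytes that are not valid UTF-8
text, enc = guess_encoding(data)
print(text, enc)  # 'déjà vu' plus whichever candidate encoding succeeded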
+ """ + successful_encoding = None + # we make 'utf-8' the first encoding + encodings = ["utf-8"] + # + # next we add anything we can learn from the locale + try: + encodings.append(locale.nl_langinfo(locale.CODESET)) + except AttributeError: + pass + try: + encodings.append(locale.getlocale()[1]) + except (AttributeError, IndexError): + pass + try: + encodings.append(locale.getdefaultlocale()[1]) + except (AttributeError, IndexError): + pass + # + # we try 'latin-1' last + encodings.append("latin-1") + for enc in encodings: + # some of the locale calls + # may have returned None + if not enc: + continue + try: + decoded = str(data, enc) + successful_encoding = enc + + except (UnicodeError, LookupError): + pass + else: + break + if not successful_encoding: + raise UnicodeError( + "Unable to decode input data. " + "Tried the following encodings: %s." + % ", ".join([repr(enc) for enc in encodings if enc]) + ) + else: + return (decoded, successful_encoding) + + +########################################################################## +# Remove repeated elements from a list deterministcally +########################################################################## + + +def unique_list(xs): + seen = set() + # not seen.add(x) here acts to make the code shorter without using if statements, seen.add(x) always returns None. + return [x for x in xs if x not in seen and not seen.add(x)] + + +########################################################################## +# Invert a dictionary +########################################################################## + + +def invert_dict(d): + inverted_dict = defaultdict(list) + for key in d: + if hasattr(d[key], "__iter__"): + for term in d[key]: + inverted_dict[term].append(key) + else: + inverted_dict[d[key]] = key + return inverted_dict + + +########################################################################## +# Utilities for directed graphs: transitive closure, and inversion +# The graph is represented as a dictionary of sets +########################################################################## + + +def transitive_closure(graph, reflexive=False): + """ + Calculate the transitive closure of a directed graph, + optionally the reflexive transitive closure. + + The algorithm is a slight modification of the "Marking Algorithm" of + Ioannidis & Ramakrishnan (1998) "Efficient Transitive Closure Algorithms". + + :param graph: the initial graph, represented as a dictionary of sets + :type graph: dict(set) + :param reflexive: if set, also make the closure reflexive + :type reflexive: bool + :rtype: dict(set) + """ + if reflexive: + base_set = lambda k: {k} + else: + base_set = lambda k: set() + # The graph U_i in the article: + agenda_graph = {k: graph[k].copy() for k in graph} + # The graph M_i in the article: + closure_graph = {k: base_set(k) for k in graph} + for i in graph: + agenda = agenda_graph[i] + closure = closure_graph[i] + while agenda: + j = agenda.pop() + closure.add(j) + closure |= closure_graph.setdefault(j, base_set(j)) + agenda |= agenda_graph.get(j, base_set(j)) + agenda -= closure + return closure_graph + + +def invert_graph(graph): + """ + Inverts a directed graph. 
+ + :param graph: the graph, represented as a dictionary of sets + :type graph: dict(set) + :return: the inverted graph + :rtype: dict(set) + """ + inverted = {} + for key in graph: + for value in graph[key]: + inverted.setdefault(value, set()).add(key) + return inverted + + +########################################################################## +# HTML Cleaning +########################################################################## + + +def clean_html(html): + raise NotImplementedError( + "To remove HTML markup, use BeautifulSoup's get_text() function" + ) + + +def clean_url(url): + raise NotImplementedError( + "To remove HTML markup, use BeautifulSoup's get_text() function" + ) + + +########################################################################## +# FLATTEN LISTS +########################################################################## + + +def flatten(*args): + """ + Flatten a list. + + >>> from nltk.util import flatten + >>> flatten(1, 2, ['b', 'a' , ['c', 'd']], 3) + [1, 2, 'b', 'a', 'c', 'd', 3] + + :param args: items and lists to be combined into a single list + :rtype: list + """ + + x = [] + for l in args: + if not isinstance(l, (list, tuple)): + l = [l] + for item in l: + if isinstance(item, (list, tuple)): + x.extend(flatten(item)) + else: + x.append(item) + return x + + +########################################################################## +# Ngram iteration +########################################################################## + + +def pad_sequence( + sequence, + n, + pad_left=False, + pad_right=False, + left_pad_symbol=None, + right_pad_symbol=None, +): + """ + Returns a padded sequence of items before ngram extraction. + + >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='', right_pad_symbol='')) + ['', 1, 2, 3, 4, 5, ''] + >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='')) + ['', 1, 2, 3, 4, 5] + >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='')) + [1, 2, 3, 4, 5, ''] + + :param sequence: the source data to be padded + :type sequence: sequence or iter + :param n: the degree of the ngrams + :type n: int + :param pad_left: whether the ngrams should be left-padded + :type pad_left: bool + :param pad_right: whether the ngrams should be right-padded + :type pad_right: bool + :param left_pad_symbol: the symbol to use for left padding (default is None) + :type left_pad_symbol: any + :param right_pad_symbol: the symbol to use for right padding (default is None) + :type right_pad_symbol: any + :rtype: sequence or iter + """ + sequence = iter(sequence) + if pad_left: + sequence = chain((left_pad_symbol,) * (n - 1), sequence) + if pad_right: + sequence = chain(sequence, (right_pad_symbol,) * (n - 1)) + return sequence + + +# add a flag to pad the sequence so we get peripheral ngrams? + + +def ngrams(sequence, n, **kwargs): + """ + Return the ngrams generated from a sequence of items, as an iterator. + For example: + + >>> from nltk.util import ngrams + >>> list(ngrams([1,2,3,4,5], 3)) + [(1, 2, 3), (2, 3, 4), (3, 4, 5)] + + Wrap with list for a list version of this function. 
Set pad_left + or pad_right to true in order to get additional ngrams: + + >>> list(ngrams([1,2,3,4,5], 2, pad_right=True)) + [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)] + >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='')) + [(1, 2), (2, 3), (3, 4), (4, 5), (5, '')] + >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='')) + [('', 1), (1, 2), (2, 3), (3, 4), (4, 5)] + >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='', right_pad_symbol='')) + [('', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '')] + + + :param sequence: the source data to be converted into ngrams + :type sequence: sequence or iter + :param n: the degree of the ngrams + :type n: int + :param pad_left: whether the ngrams should be left-padded + :type pad_left: bool + :param pad_right: whether the ngrams should be right-padded + :type pad_right: bool + :param left_pad_symbol: the symbol to use for left padding (default is None) + :type left_pad_symbol: any + :param right_pad_symbol: the symbol to use for right padding (default is None) + :type right_pad_symbol: any + :rtype: sequence or iter + """ + sequence = pad_sequence(sequence, n, **kwargs) + + # Creates the sliding window, of n no. of items. + # `iterables` is a tuple of iterables where each iterable is a window of n items. + iterables = tee(sequence, n) + + for i, sub_iterable in enumerate(iterables): # For each window, + for _ in range(i): # iterate through every order of ngrams + next(sub_iterable, None) # generate the ngrams within the window. + return zip(*iterables) # Unpack and flattens the iterables. + + +def bigrams(sequence, **kwargs): + """ + Return the bigrams generated from a sequence of items, as an iterator. + For example: + + >>> from nltk.util import bigrams + >>> list(bigrams([1,2,3,4,5])) + [(1, 2), (2, 3), (3, 4), (4, 5)] + + Use bigrams for a list version of this function. + + :param sequence: the source data to be converted into bigrams + :type sequence: sequence or iter + :rtype: iter(tuple) + """ + + yield from ngrams(sequence, 2, **kwargs) + + +def trigrams(sequence, **kwargs): + """ + Return the trigrams generated from a sequence of items, as an iterator. + For example: + + >>> from nltk.util import trigrams + >>> list(trigrams([1,2,3,4,5])) + [(1, 2, 3), (2, 3, 4), (3, 4, 5)] + + Use trigrams for a list version of this function. + + :param sequence: the source data to be converted into trigrams + :type sequence: sequence or iter + :rtype: iter(tuple) + """ + + yield from ngrams(sequence, 3, **kwargs) + + +def everygrams( + sequence, min_len=1, max_len=-1, pad_left=False, pad_right=False, **kwargs +): + """ + Returns all possible ngrams generated from a sequence of items, as an iterator. + + >>> sent = 'a b c'.split() + + New version outputs for everygrams. + >>> list(everygrams(sent)) + [('a',), ('a', 'b'), ('a', 'b', 'c'), ('b',), ('b', 'c'), ('c',)] + + Old version outputs for everygrams. + >>> sorted(everygrams(sent), key=len) + [('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c'), ('a', 'b', 'c')] + + >>> list(everygrams(sent, max_len=2)) + [('a',), ('a', 'b'), ('b',), ('b', 'c'), ('c',)] + + :param sequence: the source data to be converted into ngrams. If max_len is + not provided, this sequence will be loaded into memory + :type sequence: sequence or iter + :param min_len: minimum length of the ngrams, aka. 
+        n-gram order/degree of ngram
+    :type min_len: int
+    :param max_len: maximum length of the ngrams (set to length of sequence by default)
+    :type max_len: int
+    :param pad_left: whether the ngrams should be left-padded
+    :type pad_left: bool
+    :param pad_right: whether the ngrams should be right-padded
+    :type pad_right: bool
+    :rtype: iter(tuple)
+    """
+
+    # Get max_len for padding.
+    if max_len == -1:
+        try:
+            max_len = len(sequence)
+        except TypeError:
+            sequence = list(sequence)
+            max_len = len(sequence)
+
+    # Pad if indicated using max_len.
+    sequence = pad_sequence(sequence, max_len, pad_left, pad_right, **kwargs)
+
+    # Sliding window to store grams.
+    history = list(islice(sequence, max_len))
+
+    # Yield ngrams from sequence.
+    while history:
+        for ngram_len in range(min_len, len(history) + 1):
+            yield tuple(history[:ngram_len])
+
+        # Append element to history if sequence has more items.
+        try:
+            history.append(next(sequence))
+        except StopIteration:
+            pass
+
+        del history[0]
+
+
+def skipgrams(sequence, n, k, **kwargs):
+    """
+    Returns all possible skipgrams generated from a sequence of items, as an iterator.
+    Skipgrams are ngrams that allow tokens to be skipped.
+    Refer to http://homepages.inf.ed.ac.uk/ballison/pdf/lrec_skipgrams.pdf
+
+    >>> sent = "Insurgents killed in ongoing fighting".split()
+    >>> list(skipgrams(sent, 2, 2))
+    [('Insurgents', 'killed'), ('Insurgents', 'in'), ('Insurgents', 'ongoing'), ('killed', 'in'), ('killed', 'ongoing'), ('killed', 'fighting'), ('in', 'ongoing'), ('in', 'fighting'), ('ongoing', 'fighting')]
+    >>> list(skipgrams(sent, 3, 2))
+    [('Insurgents', 'killed', 'in'), ('Insurgents', 'killed', 'ongoing'), ('Insurgents', 'killed', 'fighting'), ('Insurgents', 'in', 'ongoing'), ('Insurgents', 'in', 'fighting'), ('Insurgents', 'ongoing', 'fighting'), ('killed', 'in', 'ongoing'), ('killed', 'in', 'fighting'), ('killed', 'ongoing', 'fighting'), ('in', 'ongoing', 'fighting')]
+
+    :param sequence: the source data to be converted into skipgrams
+    :type sequence: sequence or iter
+    :param n: the degree of the ngrams
+    :type n: int
+    :param k: the skip distance
+    :type k: int
+    :rtype: iter(tuple)
+    """
+
+    # Pads the sequence as desired by **kwargs.
+    if "pad_left" in kwargs or "pad_right" in kwargs:
+        sequence = pad_sequence(sequence, n, **kwargs)
+
+    # Note when iterating through the ngrams, the pad_right here is not
+    # the **kwargs padding, it's for the algorithm to detect the SENTINEL
+    # object on the right pad to stop the inner loop.
+    SENTINEL = object()
+    for ngram in ngrams(sequence, n + k, pad_right=True, right_pad_symbol=SENTINEL):
+        head = ngram[:1]
+        tail = ngram[1:]
+        for skip_tail in combinations(tail, n - 1):
+            if skip_tail[-1] is SENTINEL:
+                continue
+            yield head + skip_tail
+
+
+######################################################################
+# Binary Search in a File
+######################################################################
+
+# inherited from pywordnet, by Oliver Steele
+def binary_search_file(file, key, cache=None, cacheDepth=-1):
+    """
+    Return the line from the file with first word key.
+    Searches through a sorted file using the binary search algorithm.
+
+    :type file: file
+    :param file: the file to be searched through.
+    :type key: str
+    :param key: the identifier we are searching for.
+ """ + + key = key + " " + keylen = len(key) + start = 0 + currentDepth = 0 + + if hasattr(file, "name"): + end = os.stat(file.name).st_size - 1 + else: + file.seek(0, 2) + end = file.tell() - 1 + file.seek(0) + + if cache is None: + cache = {} + + while start < end: + lastState = start, end + middle = (start + end) // 2 + + if cache.get(middle): + offset, line = cache[middle] + + else: + line = "" + while True: + file.seek(max(0, middle - 1)) + if middle > 0: + file.discard_line() + offset = file.tell() + line = file.readline() + if line != "": + break + # at EOF; try to find start of the last line + middle = (start + middle) // 2 + if middle == end - 1: + return None + if currentDepth < cacheDepth: + cache[middle] = (offset, line) + + if offset > end: + assert end != middle - 1, "infinite loop" + end = middle - 1 + elif line[:keylen] == key: + return line + elif line > key: + assert end != middle - 1, "infinite loop" + end = middle - 1 + elif line < key: + start = offset + len(line) - 1 + + currentDepth += 1 + thisState = start, end + + if lastState == thisState: + # Detects the condition where we're searching past the end + # of the file, which is otherwise difficult to detect + return None + + return None + + +###################################################################### +# Proxy configuration +###################################################################### + + +def set_proxy(proxy, user=None, password=""): + """ + Set the HTTP proxy for Python to download through. + + If ``proxy`` is None then tries to set proxy from environment or system + settings. + + :param proxy: The HTTP proxy server to use. For example: + 'http://proxy.example.com:3128/' + :param user: The username to authenticate with. Use None to disable + authentication. + :param password: The password to authenticate with. + """ + if proxy is None: + # Try and find the system proxy settings + try: + proxy = getproxies()["http"] + except KeyError as e: + raise ValueError("Could not detect default proxy settings") from e + + # Set up the proxy handler + proxy_handler = ProxyHandler({"https": proxy, "http": proxy}) + opener = build_opener(proxy_handler) + + if user is not None: + # Set up basic proxy authentication if provided + password_manager = HTTPPasswordMgrWithDefaultRealm() + password_manager.add_password(realm=None, uri=proxy, user=user, passwd=password) + opener.add_handler(ProxyBasicAuthHandler(password_manager)) + opener.add_handler(ProxyDigestAuthHandler(password_manager)) + + # Override the existing url opener + install_opener(opener) + + +###################################################################### +# ElementTree pretty printing from https://www.effbot.org/zone/element-lib.htm +###################################################################### + + +def elementtree_indent(elem, level=0): + """ + Recursive function to indent an ElementTree._ElementInterface + used for pretty printing. Run indent on elem and then output + in the normal way. + + :param elem: element to be indented. will be modified. 
+    :type elem: ElementTree._ElementInterface
+    :param level: level of indentation for this element
+    :type level: nonnegative integer
+    :rtype: ElementTree._ElementInterface
+    :return: Contents of elem indented to reflect its structure
+    """
+
+    i = "\n" + level * "  "
+    if len(elem):
+        if not elem.text or not elem.text.strip():
+            elem.text = i + "  "
+        for elem in elem:
+            elementtree_indent(elem, level + 1)
+        if not elem.tail or not elem.tail.strip():
+            elem.tail = i
+    else:
+        if level and (not elem.tail or not elem.tail.strip()):
+            elem.tail = i
+
+
+######################################################################
+# Mathematical approximations
+######################################################################
+
+
+def choose(n, k):
+    """
+    This function is a fast way to calculate binomial coefficients, commonly
+    known as nCk, i.e. the number of combinations of n things taken k at a time.
+    (https://en.wikipedia.org/wiki/Binomial_coefficient).
+
+    This computes the same value as *scipy.special.comb()* with exact (long)
+    integer arithmetic, but is faster; see https://github.com/nltk/nltk/issues/1181
+
+    >>> choose(4, 2)
+    6
+    >>> choose(6, 2)
+    15
+
+    :param n: The number of things.
+    :type n: int
+    :param k: The number of things taken at a time.
+    :type k: int
+    """
+    if 0 <= k <= n:
+        ntok, ktok = 1, 1
+        for t in range(1, min(k, n - k) + 1):
+            ntok *= n
+            ktok *= t
+            n -= 1
+        return ntok // ktok
+    else:
+        return 0
+
+
+######################################################################
+# Iteration utilities
+######################################################################
+
+
+def pairwise(iterable):
+    """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
+    a, b = tee(iterable)
+    next(b, None)
+    return zip(a, b)
+
+
+######################################################################
+# Parallelization.
+###################################################################### + + +def parallelize_preprocess(func, iterator, processes, progress_bar=False): + from joblib import Parallel, delayed + from tqdm import tqdm + + iterator = tqdm(iterator) if progress_bar else iterator + if processes <= 1: + return map(func, iterator) + return Parallel(n_jobs=processes)(delayed(func)(line) for line in iterator) diff --git a/examples/eval_cibench_api.py b/examples/eval_cibench_api.py new file mode 100644 index 0000000000000000000000000000000000000000..4f844b7cc8646b37243f7606263b1ca61549aa54 --- /dev/null +++ b/examples/eval_cibench_api.py @@ -0,0 +1,117 @@ +from lagent.agents.react import ReActProtocol +from mmengine.config import read_base + +from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter +from opencompass.lagent.agents.react import CIReAct +from opencompass.models import OpenAI +from opencompass.models.lagent import CodeAgent +from opencompass.partitioners import NaivePartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks import OpenICLInferTask + +with read_base(): + from opencompass.configs.datasets.CIBench.CIBench_generation_gen_8ab0dc import \ + cibench_datasets as cibench_datasets_generation + from opencompass.configs.datasets.CIBench.CIBench_template_gen_e6b12a import \ + cibench_datasets as cibench_datasets_template + # Oracle mode for analysis + # from opencompass.configs.datasets.CIBench.CIBench_template_oracle_gen_fecda1 import cibench_datasets as cibench_datasets_template_oracle + # from opencompass.configs.datasets.CIBench.CIBench_generation_oracle_gen_c4a7c1 import cibench_datasets as cibench_datasets_generation_oracle + from opencompass.configs.summarizers.cibench import summarizer + +datasets = [] +datasets += cibench_datasets_template +datasets += cibench_datasets_generation +# datasets += cibench_datasets_template_oracle +# datasets += cibench_datasets_generation_oracle + +FORCE_STOP_PROMPT_EN = """You should directly give results based on history information.""" + +FEWSHOT_INSTRUCTION = """\ +You are an assistant who can utilize external tools. +{tool_description} +To use a tool, please response with the following format: +``` +{thought} Think what you need to solve, do you need to use tools? +{action} The tool name, should be one of [{action_names}]. +{action_input} The input to the tool that you want to use. +``` +The tool will give you response after your response using the following format: +``` +{response} the results after call the tool. +``` +Therefore DO NOT generate tool response by yourself. + +Also please follow the guidelines: +1. Always use code interpreter to solve the problem. +2. The generated codes should always in a markdown code block format. +3. The generated codes will be executed in an ipython manner and the results will be cached. +4. Your responded code should always be simple and only solves the problem in current step. + +For example: + +File url: `xxxx` +### Step 1. Load the dataset from the url into a pandas DataFrame named `df`. + +{thought} We should use `pandas` to solve this step. +{action} IPythonInterpreter +{action_input} ```python +import pandas as pd +url = "xxxx" +data = pd.read_csv(url) +``` +{response} The code is succeed without any outputs. + +Let us begin from here! +""" + +IPYTHON_INTERPRETER_DESCRIPTION = '''\ +It can run Python code in a manner as jupyter notebook. 
The code must be a valid code that contains only python method.''' + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + dict(role='SYSTEM', api_role='SYSTEM'), +], ) + +actions = [ + dict(type=IPythonInterpreter, + user_data_dir='./data/cibench_dataset/datasources', + description=IPYTHON_INTERPRETER_DESCRIPTION) +] +protocol = dict( + type=ReActProtocol, + call_protocol=FEWSHOT_INSTRUCTION, + force_stop=FORCE_STOP_PROMPT_EN, + finish=dict(role='FINISH', begin='Final Answer:', end='\n'), +) + +work_dir = 'outputs/cibench/' +models = [ + dict( + abbr='gpt-4o', + type=CodeAgent, + agent_type=CIReAct, + max_turn=3, + llm=dict( + type=OpenAI, + path='gpt-4o', + rpm_verbose=True, + retry=99, + meta_template=api_meta_template, + query_per_second=1, + max_seq_len=2048, + temperature=0, + ), + actions=actions, + protocol=protocol, + batch_size=1, + ), +] + +infer = dict( + partitioner=dict(type=NaivePartitioner), + runner=dict(type=LocalRunner, + max_num_workers=4, + task=dict(type=OpenICLInferTask)), +) diff --git a/examples/eval_circular.py b/examples/eval_circular.py new file mode 100644 index 0000000000000000000000000000000000000000..783bdc0a3925ba617f50aa86dd8b17c9d566bc75 --- /dev/null +++ b/examples/eval_circular.py @@ -0,0 +1,115 @@ +from mmengine.config import read_base + +from opencompass.datasets.circular import ( + CircularARCDataset, CircularCEvalDataset, CircularCMMLUDataset, + CircularCSQADataset, CircularEvaluator, CircularHSWAGDataset, + CircularMMLUDataset, CircularOBQADataset, CircularRaceDataset) +from opencompass.summarizers import CircularSummarizer + +with read_base(): + from opencompass.configs.datasets.ARC_c.ARC_c_gen_1e0de5 import \ + ARC_c_datasets + from opencompass.configs.datasets.ARC_e.ARC_e_gen_1e0de5 import \ + ARC_e_datasets + from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import \ + ceval_datasets + from opencompass.configs.datasets.cmmlu.cmmlu_gen_c13365 import \ + cmmlu_datasets + from opencompass.configs.datasets.commonsenseqa.commonsenseqa_gen_1da2d0 import \ + commonsenseqa_datasets + from opencompass.configs.datasets.hellaswag.hellaswag_gen_6faab5 import \ + hellaswag_datasets + from opencompass.configs.datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets + from opencompass.configs.datasets.obqa.obqa_gen_9069e4 import obqa_datasets + from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets + from opencompass.configs.models.hf_internlm.hf_internlm_chat_7b import \ + models as hf_internlm_chat_7b_model + from opencompass.configs.models.hf_internlm.hf_internlm_chat_20b import \ + models as hf_internlm_chat_20b_model + from opencompass.configs.models.qwen.hf_qwen_7b_chat import \ + models as hf_qwen_7b_chat_model + from opencompass.configs.models.qwen.hf_qwen_14b_chat import \ + models as hf_qwen_14b_chat_model + from opencompass.configs.summarizers.groups.ceval import \ + ceval_summary_groups + from opencompass.configs.summarizers.groups.cmmlu import \ + cmmlu_summary_groups + from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups + +for ds, t in [ + (ceval_datasets, CircularCEvalDataset), + (mmlu_datasets, CircularMMLUDataset), + (cmmlu_datasets, CircularCMMLUDataset), + (hellaswag_datasets, CircularHSWAGDataset), + (ARC_e_datasets, CircularARCDataset), + (ARC_c_datasets, CircularARCDataset), + (commonsenseqa_datasets, CircularCSQADataset), + (obqa_datasets, CircularOBQADataset), + (race_datasets, CircularRaceDataset), +]: + for d in ds: 
+ d['type'] = t + d['abbr'] = d['abbr'] + '-circular-4' + d['eval_cfg']['evaluator'] = { + 'type': CircularEvaluator, + 'circular_pattern': 'circular' + } + d['circular_patterns'] = 'circular' + +datasets = sum([ + v + for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets' +], []) +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + +# config summarizer +other_summary_groups = [ + { + 'name': + 'average', + 'subsets': [ + 'ceval', 'mmlu', 'cmmlu', 'hellaswag', 'ARC-e', 'ARC-c', + 'commonsense_qa', 'openbookqa_fact', 'race-middle', 'race-high' + ] + }, +] +origin_summary_groups = sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []) +new_summary_groups = [] +for item in origin_summary_groups: + new_summary_groups.append({ + 'name': + item['name'] + '-circular-4', + 'subsets': [i + '-circular-4' for i in item['subsets']], + }) +summarizer = dict( + type=CircularSummarizer, + metric_types=['acc_origin', 'perf_circular'], + dataset_abbrs=[ + 'average-circular-4', + 'ceval-circular-4', + 'mmlu-circular-4', + 'cmmlu-circular-4', + 'hellaswag-circular-4', + 'ARC-e-circular-4', + 'ARC-c-circular-4', + 'commonsense_qa-circular-4', + 'openbookqa_fact-circular-4', + 'race-middle-circular-4', + 'race-high-circular-4', + 'ceval-humanities-circular-4', + 'ceval-stem-circular-4', + 'ceval-social-science-circular-4', + 'ceval-other-circular-4', + 'mmlu-humanities-circular-4', + 'mmlu-stem-circular-4', + 'mmlu-social-science-circular-4', + 'mmlu-other-circular-4', + 'cmmlu-humanities-circular-4', + 'cmmlu-stem-circular-4', + 'cmmlu-social-science-circular-4', + 'cmmlu-other-circular-4', + 'cmmlu-china-specific-circular-4', + ], + summary_groups=new_summary_groups, +) diff --git a/examples/eval_corebench_2409_chat_objective.py b/examples/eval_corebench_2409_chat_objective.py new file mode 100644 index 0000000000000000000000000000000000000000..fe00457a6030e446cf0ee551f87aeedae77aeca5 --- /dev/null +++ b/examples/eval_corebench_2409_chat_objective.py @@ -0,0 +1,208 @@ +import os.path as osp + +from mmengine.config import read_base + +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask + +####################################################################### +# PART 0 Essential Configs # +####################################################################### +with read_base(): + # Datasets Part + ## Core Set + # ## Examination + # ## Reasoning + from opencompass.configs.datasets.bbh.bbh_gen_4a31fa import bbh_datasets + from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \ + cmmlu_datasets + from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \ + drop_datasets + # ## Scientific + from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \ + gpqa_datasets + from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_a58960 import \ + gsm8k_datasets + from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \ + hellaswag_datasets + # ## Coding + from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import \ + humaneval_datasets + # TODO: Add LiveCodeBench + # ## Instruction Following + from opencompass.configs.datasets.IFEval.IFEval_gen_3321a3 import \ + ifeval_datasets + # ## Math + from opencompass.configs.datasets.math.math_0shot_gen_393424 import \ + math_datasets + from 
opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
+        mathbench_datasets
+    from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \
+        sanitized_mbpp_datasets
+    from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import \
+        mmlu_datasets
+    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
+        mmlu_pro_datasets
+    from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
+    from opencompass.configs.summarizers.groups.cmmlu import \
+        cmmlu_summary_groups
+    # Summarizer
+    from opencompass.configs.summarizers.groups.mmlu import mmlu_summary_groups
+    from opencompass.configs.summarizers.groups.mmlu_pro import \
+        mmlu_pro_summary_groups
+
+    # Model List
+    # from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import models as lmdeploy_qwen2_1_5b_instruct_model
+    # from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import models as hf_internlm2_5_7b_chat_model
+    # from opencompass.configs.models.openbmb.hf_minicpm_2b_sft_bf16 import models as hf_minicpm_2b_sft_bf16_model
+    # from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import models as hf_yi_1_5_6b_chat_model
+    # from opencompass.configs.models.gemma.hf_gemma_2b_it import models as hf_gemma_2b_it_model
+    # from opencompass.configs.models.yi.hf_yi_1_5_34b_chat import models as hf_yi_1_5_34b_chat_model
+
+#######################################################################
+#                          PART 1  Datasets List                      #
+#######################################################################
+# datasets list for evaluation
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
+
+#######################################################################
+#                       PART 2  Dataset Summarizer                    #
+#######################################################################
+# with read_base():
+
+core_summary_groups = [
+    {
+        'name':
+        'core_average',
+        'subsets': [['mmlu', 'accuracy'], ['mmlu_pro', 'accuracy'],
+                    ['cmmlu', 'accuracy'], ['bbh', 'score'],
+                    ['math', 'accuracy'],
+                    ['openai_humaneval', 'humaneval_pass@1'],
+                    ['GPQA_diamond', 'accuracy'],
+                    ['IFEval', 'Prompt-level-strict-accuracy'],
+                    ['drop', 'accuracy'], ['sanitized_mbpp', 'score'],
+                    ['gsm8k', 'accuracy'], ['hellaswag', 'accuracy'],
+                    ['mathbench-t (average)', 'naive_average']],
+    },
+]
+
+summarizer = dict(
+    dataset_abbrs=[
+        ['core_average', 'naive_average'],
+        ['mmlu', 'accuracy'],
+        ['mmlu_pro', 'accuracy'],
+        ['cmmlu', 'accuracy'],
+        ['bbh', 'score'],
+        ['math', 'accuracy'],
+        ['openai_humaneval', 'humaneval_pass@1'],
+        ['GPQA_diamond', 'accuracy'],
+        ['IFEval', 'Prompt-level-strict-accuracy'],
+        ['drop', 'accuracy'],
+        ['sanitized_mbpp', 'score'],
+        ['gsm8k', 'accuracy'],
+        ['hellaswag', 'accuracy'],
+        'mathbench-a (average)',
+        'mathbench-t (average)',
+        '',
+        ['mmlu', 'accuracy'],
+        ['mmlu-stem', 'accuracy'],
+        ['mmlu-social-science', 'accuracy'],
+        ['mmlu-humanities', 'accuracy'],
+        ['mmlu-other', 'accuracy'],
+        '',
+        ['mmlu_pro', 'accuracy'],
+        ['mmlu_pro_math', 'accuracy'],
+        ['mmlu_pro_physics', 'accuracy'],
+        ['mmlu_pro_chemistry', 'accuracy'],
+        ['mmlu_pro_law', 'accuracy'],
+        ['mmlu_pro_engineering', 'accuracy'],
+        ['mmlu_pro_other', 'accuracy'],
+        ['mmlu_pro_economics', 'accuracy'],
+        ['mmlu_pro_health', 'accuracy'],
+        ['mmlu_pro_psychology', 'accuracy'],
+        ['mmlu_pro_business', 'accuracy'],
+        ['mmlu_pro_biology', 'accuracy'],
+        ['mmlu_pro_philosophy', 'accuracy'],
+        ['mmlu_pro_computer_science', 'accuracy'],
+
['mmlu_pro_history', 'accuracy'],
+        '',
+        ['cmmlu', 'accuracy'],
+        ['cmmlu-stem', 'accuracy'],
+        ['cmmlu-social-science', 'accuracy'],
+        ['cmmlu-humanities', 'accuracy'],
+        ['cmmlu-other', 'accuracy'],
+        ['cmmlu-china-specific', 'accuracy'],
+        '',
+        ['bbh', 'extract_rate'],
+        ['math', 'extract_rate'],
+        # ['openai_humaneval', 'extract_rate'],
+        ['GPQA_diamond', 'extract_rate'],
+        # ['IFEval', 'extract_rate'],
+        '',
+        ['mmlu', 'extract_rate'],
+        ['mmlu-stem', 'extract_rate'],
+        ['mmlu-social-science', 'extract_rate'],
+        ['mmlu-humanities', 'extract_rate'],
+        ['mmlu-other', 'extract_rate'],
+        '',
+        ['mmlu_pro', 'extract_rate'],
+        ['mmlu_pro_math', 'extract_rate'],
+        ['mmlu_pro_physics', 'extract_rate'],
+        ['mmlu_pro_chemistry', 'extract_rate'],
+        ['mmlu_pro_law', 'extract_rate'],
+        ['mmlu_pro_engineering', 'extract_rate'],
+        ['mmlu_pro_other', 'extract_rate'],
+        ['mmlu_pro_economics', 'extract_rate'],
+        ['mmlu_pro_health', 'extract_rate'],
+        ['mmlu_pro_psychology', 'extract_rate'],
+        ['mmlu_pro_business', 'extract_rate'],
+        ['mmlu_pro_biology', 'extract_rate'],
+        ['mmlu_pro_philosophy', 'extract_rate'],
+        ['mmlu_pro_computer_science', 'extract_rate'],
+        ['mmlu_pro_history', 'extract_rate'],
+        '',
+        ['cmmlu', 'extract_rate'],
+        ['cmmlu-stem', 'extract_rate'],
+        ['cmmlu-social-science', 'extract_rate'],
+        ['cmmlu-humanities', 'extract_rate'],
+        ['cmmlu-other', 'extract_rate'],
+        ['cmmlu-china-specific', 'extract_rate'],
+    ],
+    summary_groups=sum(
+        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
+)
+
+#######################################################################
+#                        PART 3  Models List                          #
+#######################################################################
+
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
+
+#######################################################################
+#                 PART 4  Inference/Evaluation Configuration          #
+#######################################################################
+
+# Local Runner
+infer = dict(
+    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=16,
+        retry=0,  # Modify if needed
+        task=dict(type=OpenICLInferTask)),
+)
+
+# eval with local runner
+eval = dict(
+    partitioner=dict(type=NaivePartitioner, n=10),
+    runner=dict(type=LocalRunner,
+                max_num_workers=16,
+                task=dict(type=OpenICLEvalTask)),
+)
+
+#######################################################################
+#                      PART 5  Utils Configuration                    #
+#######################################################################
+base_exp_dir = 'outputs/corebench_2409_objective/'
+work_dir = osp.join(base_exp_dir, 'chat_objective') diff --git a/examples/eval_dingo.py b/examples/eval_dingo.py new file mode 100644 index 0000000000000000000000000000000000000000..899eaa291d32bfd3088f3c30c5322c85be2d37e3 --- /dev/null +++ b/examples/eval_dingo.py @@ -0,0 +1,7 @@ +from mmengine.config import read_base
+
+with read_base():
+    from opencompass.configs.datasets.dingo.dingo_gen import datasets
+    from opencompass.configs.models.hf_internlm.hf_internlm_7b import models
+
+work_dir = './outputs/eval_dingo' diff --git a/examples/eval_llama2_7b.py b/examples/eval_llama2_7b.py new file mode 100644 index 0000000000000000000000000000000000000000..fcc375866369f905a5826f15297b4e6a2a7dfcd8 --- /dev/null +++ b/examples/eval_llama2_7b.py @@ -0,0 +1,8 @@ +from mmengine.config import read_base
+
+with read_base():
+    from opencompass.configs.datasets.collections.base_medium_llama import (
+        piqa_datasets,
siqa_datasets) + from opencompass.configs.models.llama.llama2_7b import models + +datasets = [*piqa_datasets, *siqa_datasets] diff --git a/examples/eval_llama2_7b_lveval.py b/examples/eval_llama2_7b_lveval.py new file mode 100644 index 0000000000000000000000000000000000000000..813d8aa3c04d17fb846ac778a07bd26a807091a0 --- /dev/null +++ b/examples/eval_llama2_7b_lveval.py @@ -0,0 +1,14 @@ +from mmengine.config import read_base + +with read_base(): + from opencompass.configs.datasets.lveval.lveval import \ + LVEval_datasets as datasets + from opencompass.configs.models.hf_llama.hf_llama2_7b_chat import models + from opencompass.configs.summarizers.lveval import summarizer + +models[0]['path'] = '/path/to/your/huggingface_models/Llama-2-7b-chat-hf' +models[0][ + 'tokenizer_path'] = '/path/to/your/huggingface_models/Llama-2-7b-chat-hf' +models[0]['max_seq_len'] = 4096 +models[0]['generation_kwargs'] = dict(do_sample=False) +models[0]['mode'] = 'mid' # truncate in the middle diff --git a/examples/eval_llama3_instruct.py b/examples/eval_llama3_instruct.py new file mode 100644 index 0000000000000000000000000000000000000000..e92dfed96f66f9b6b36c66251ac65de76965db7a --- /dev/null +++ b/examples/eval_llama3_instruct.py @@ -0,0 +1,50 @@ +from mmengine.config import read_base + +with read_base(): + from opencompass.configs.dataset_collections.chat_OC15 import datasets + from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \ + models as hf_llama3_8b_instruct_model + from opencompass.configs.summarizers.chat_OC15 import summarizer + +work_dir = 'outputs/debug/llama3-instruct' + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + +# dataset version metric mode llama-3-8b-instruct-hf +# -------------------- --------- ---------------------------- ------ ------------------------ +# average - naive_average gen 55.64 +# mmlu - naive_average gen 68.30 +# cmmlu - naive_average gen 53.29 +# ceval - naive_average gen 52.32 +# GaokaoBench - weighted_average gen 45.91 +# triviaqa_wiki_1shot eaf81e score gen 79.01 +# nq_open_1shot 01cf41 score gen 30.25 +# race-high 9a54b6 accuracy gen 81.22 +# winogrande b36770 accuracy gen 66.46 +# hellaswag e42710 accuracy gen 74.33 +# bbh - naive_average gen 67.25 +# gsm8k 1d7fe4 accuracy gen 79.08 +# math 393424 accuracy gen 27.78 +# TheoremQA 6f0af8 score gen 19.50 +# openai_humaneval 8e312c humaneval_pass@1 gen 55.49 +# sanitized_mbpp 830460 score gen 66.54 +# GPQA_diamond 4baadb accuracy gen 25.76 +# IFEval 3321a3 Prompt-level-strict-accuracy gen 67.84 +# - - - - +# mmlu - naive_average gen 68.30 +# mmlu-stem - naive_average gen 57.92 +# mmlu-social-science - naive_average gen 77.83 +# mmlu-humanities - naive_average gen 71.20 +# mmlu-other - naive_average gen 71.79 +# cmmlu - naive_average gen 53.29 +# cmmlu-stem - naive_average gen 45.40 +# cmmlu-social-science - naive_average gen 54.63 +# cmmlu-humanities - naive_average gen 54.14 +# cmmlu-other - naive_average gen 59.52 +# cmmlu-china-specific - naive_average gen 49.33 +# ceval - naive_average gen 52.32 +# ceval-stem - naive_average gen 48.16 +# ceval-social-science - naive_average gen 57.50 +# ceval-humanities - naive_average gen 53.26 +# ceval-other - naive_average gen 54.26 +# ceval-hard - naive_average gen 35.59 diff --git a/examples/eval_llm_compression.py b/examples/eval_llm_compression.py new file mode 100644 index 0000000000000000000000000000000000000000..70fe5979267de024cd6fc4e2a8e1415b604ca020 --- /dev/null +++ b/examples/eval_llm_compression.py @@ -0,0 +1,55 @@ +from 
mmengine.config import read_base
+
+with read_base():
+    # LLM compression datasets
+    from opencompass.configs.datasets.llm_compression.llm_compression import llm_compression_datasets
+
+    # Model configs
+    from opencompass.configs.models.qwen.hf_qwen1_5_7b import models as qwen1_5_7b
+    from opencompass.configs.models.qwen.hf_qwen1_5_14b import models as qwen1_5_14b
+    from opencompass.configs.models.hf_llama.hf_llama2_7b import models as llama2_7b
+    from opencompass.configs.models.hf_llama.hf_llama2_13b import models as llama2_13b
+
+from opencompass.partitioners import NaivePartitioner
+from opencompass.runners import LocalRunner
+from opencompass.summarizers import LLMCompressionSummarizer
+from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
+
+# -------------Inference Stage ----------------------------------------
+datasets = [*llm_compression_datasets]
+work_dir = 'outputs/llm_compression'
+
+models = [
+    *qwen1_5_7b,
+    *qwen1_5_14b,
+    *llama2_7b,
+    *llama2_13b,
+]
+
+# Set custom batch_size and num_gpus for faster loss calculation
+# A smaller batch_size should give more precise results, at the cost of slower evaluation
+model_cfg = dict(batch_size=8, run_cfg=dict(num_gpus=4, num_procs=1))
+
+for mdl in models:
+    mdl.update(model_cfg)
+
+infer = dict(
+    # The OpenCompass implementation of BPC currently only supports NaivePartitioner, as the sliding window approach requires the dataset to be loaded sequentially. Using other partitioner types may produce incorrect results.
+    partitioner=dict(type=NaivePartitioner),
+    runner=dict(
+        type=LocalRunner,
+        task=dict(type=OpenICLInferTask),
+        max_num_workers=256,  # Maximum concurrent inference task count
+    ),
+)
+
+# -------------Evaluation Stage ----------------------------------------
+eval = dict(partitioner=dict(type=NaivePartitioner),
+            runner=dict(
+                type=LocalRunner,
+                task=dict(type=OpenICLEvalTask),
+                max_num_workers=256,
+            ))
+
+# -------------Summarization Stage ----------------------------------------
+summarizer = dict(type=LLMCompressionSummarizer) diff --git a/tmp/81fc8aa5-7d03-48fa-b064-0f275ce8d0a8_params.py b/tmp/81fc8aa5-7d03-48fa-b064-0f275ce8d0a8_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e3c8f7a4fd256b0c1191c657488613497adbcaf0 --- /dev/null +++ b/tmp/81fc8aa5-7d03-48fa-b064-0f275ce8d0a8_params.py @@ -0,0 +1,61 @@ +datasets = [
+    [
+        dict(
+            abbr='triviaqa_wiki_1shot_6',
+            eval_cfg=dict(
+                evaluator=dict(type='opencompass.datasets.TriviaQAEvaluator'),
+                pred_role='BOT'),
+            infer_cfg=dict(
+                ice_template=dict(
+                    template='Q: {question}\nA: {answer}.\n',
+                    type=
+                    'opencompass.openicl.icl_prompt_template.PromptTemplate'),
+                inferencer=dict(
+                    max_out_len=50,
+                    stopping_criteria=[
+                        'Q:',
+                        '\n',
+                    ],
+                    type='opencompass.openicl.icl_inferencer.GenInferencer'),
+                prompt_template=dict(
+                    ice_token='',
+                    template='Q: {question}\nA: ',
+                    type=
+                    'opencompass.openicl.icl_prompt_template.PromptTemplate'),
+                retriever=dict(
+                    fix_id_list=[
+                        0,
+                    ],
+                    type='opencompass.openicl.icl_retriever.FixKRetriever')),
+            path='opencompass/trivia_qa',
+            reader_cfg=dict(
+                input_columns=[
+                    'question',
+                ],
+                output_column='answer',
+                test_range='[6000:7000]',
+                test_split='validation',
+                train_split='train'),
+            type='opencompass.datasets.TriviaQADatasetV2'),
+    ],
+]
+models = [
+    dict(
+        abbr='mask_gdn_1B_hrr-rank4_hf',
+        batch_size=8,
+        generation_kwargs=dict(),
+        max_out_len=256,
+        max_seq_len=None,
+        model_kwargs=dict(),
+        pad_token_id=None,
+        path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4',
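+        # No PEFT adapter is attached in this auto-generated dump: peft_path
+        # stays None and peft_kwargs is empty, so the checkpoint above is
+        # loaded as-is.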
+ peft_kwargs=dict(), + peft_path=None, + run_cfg=dict(num_gpus=1), + stop_words=[], + tokenizer_kwargs=dict(), + tokenizer_path=None, + type='opencompass.models.huggingface_above_v4_33.HuggingFaceBaseModel' + ), +] +work_dir = 'outputs/default/20251127_190244' diff --git a/tmp/821e04c7-a51d-4c35-a17e-c0c8a2687922_params.py b/tmp/821e04c7-a51d-4c35-a17e-c0c8a2687922_params.py new file mode 100644 index 0000000000000000000000000000000000000000..905b1cfba41a700b00b09513d5ca2efa66b1da28 --- /dev/null +++ b/tmp/821e04c7-a51d-4c35-a17e-c0c8a2687922_params.py @@ -0,0 +1,54 @@ +datasets = [ + [ + dict( + abbr='LongBench_multifieldqa_zh', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + ], +] +eval = dict(runner=dict(task=dict(dump_details=True))) +models = [ + dict( + abbr='retnet', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/retnet-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/retnet-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251207_223306' diff --git a/tmp/82b65cdc-50bc-45e2-b8d6-0afb653aec84_params.py b/tmp/82b65cdc-50bc-45e2-b8d6-0afb653aec84_params.py new file mode 100644 index 0000000000000000000000000000000000000000..c1dab2de09e88c774964c03e76bb33d041dc1dbf --- /dev/null +++ b/tmp/82b65cdc-50bc-45e2-b8d6-0afb653aec84_params.py @@ -0,0 +1,53 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + ], +] +eval = dict(runner=dict(task=dict(dump_details=True))) +models = [ + dict( + abbr='retnet', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/retnet-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/retnet-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251207_223306' diff --git a/tmp/8402843f-f3de-4c52-8d9f-e92df07a79aa_params.py b/tmp/8402843f-f3de-4c52-8d9f-e92df07a79aa_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/842c377a-7cbb-4231-a917-17b662cc254f_params.py b/tmp/842c377a-7cbb-4231-a917-17b662cc254f_params.py new file mode 100644 index 0000000000000000000000000000000000000000..18776189d956d7750c55fa5bb3f4908897d830fe --- /dev/null +++ b/tmp/842c377a-7cbb-4231-a917-17b662cc254f_params.py @@ -0,0 +1,61 @@ +datasets = [ + [ + dict( + abbr='triviaqa_wiki_1shot_4', + eval_cfg=dict( + evaluator=dict(type='opencompass.datasets.TriviaQAEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + ice_template=dict( + template='Q: {question}\nA: {answer}.\n', + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + inferencer=dict( + max_out_len=50, + stopping_criteria=[ + 'Q:', + '\n', + ], + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + ice_token='', + template='Q: {question}\nA: ', + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + fix_id_list=[ + 0, + ], + type='opencompass.openicl.icl_retriever.FixKRetriever')), + path='opencompass/trivia_qa', + reader_cfg=dict( + input_columns=[ + 'question', + ], + output_column='answer', + test_range='[4000:5000]', + test_split='validation', + train_split='train'), + type='opencompass.datasets.TriviaQADatasetV2'), + ], +] +models = [ + dict( + abbr='mask_gdn_1B_hrr-rank4_hf', + batch_size=8, + generation_kwargs=dict(), + max_out_len=256, + max_seq_len=None, + model_kwargs=dict(), + pad_token_id=None, + path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + peft_kwargs=dict(), + peft_path=None, + run_cfg=dict(num_gpus=1), + stop_words=[], + tokenizer_kwargs=dict(), + tokenizer_path=None, + type='opencompass.models.huggingface_above_v4_33.HuggingFaceBaseModel' + ), +] +work_dir = 'outputs/default/20251127_193336' diff --git a/tmp/85005fd7-a287-4e05-9c58-9dd042447943_params.py b/tmp/85005fd7-a287-4e05-9c58-9dd042447943_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/856be67c-11cf-4d3e-b7f9-e4dc4028faf1_params.py b/tmp/856be67c-11cf-4d3e-b7f9-e4dc4028faf1_params.py new file mode 100644 index 0000000000000000000000000000000000000000..8e3bbb582211947b89f849ce9d0fcda5d65c9458 --- /dev/null +++ b/tmp/856be67c-11cf-4d3e-b7f9-e4dc4028faf1_params.py @@ 
-0,0 +1,1424 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[38:57]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + 
prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[38:57]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + 
prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251127_221150' diff --git a/tmp/8724d010-ca31-4949-9acb-787fe0b0dd6f_params.py b/tmp/8724d010-ca31-4949-9acb-787fe0b0dd6f_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/8d85bc99-0f59-4933-9afa-8c666678c2d5_params.py b/tmp/8d85bc99-0f59-4933-9afa-8c666678c2d5_params.py new file mode 100644 index 0000000000000000000000000000000000000000..c6a4f32f67bab990a8d50ee79f9de844b664d36a --- /dev/null +++ b/tmp/8d85bc99-0f59-4933-9afa-8c666678c2d5_params.py @@ -0,0 +1,61 @@ +datasets = [ + [ + dict( + abbr='triviaqa_wiki_1shot_1', + eval_cfg=dict( + evaluator=dict(type='opencompass.datasets.TriviaQAEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + ice_template=dict( + template='Q: {question}\nA: {answer}.\n', + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + inferencer=dict( + max_out_len=50, + stopping_criteria=[ + 'Q:', + '\n', + ], + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + ice_token='', + template='Q: {question}\nA: ', + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + fix_id_list=[ + 0, + ], + type='opencompass.openicl.icl_retriever.FixKRetriever')), + path='opencompass/trivia_qa', + reader_cfg=dict( + input_columns=[ + 'question', + ], + output_column='answer', + test_range='[1000:2000]', + test_split='validation', + train_split='train'), + type='opencompass.datasets.TriviaQADatasetV2'), + ], +] +models = [ + dict( + abbr='mask_gdn_1B_hrr-rank4_hf', + batch_size=8, + generation_kwargs=dict(), + max_out_len=256, + max_seq_len=None, + model_kwargs=dict(), + pad_token_id=None, + path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + peft_kwargs=dict(), + peft_path=None, + run_cfg=dict(num_gpus=1), + stop_words=[], + tokenizer_kwargs=dict(), + tokenizer_path=None, + type='opencompass.models.huggingface_above_v4_33.HuggingFaceBaseModel' + ), +] +work_dir = 'outputs/default/20251127_193336' diff --git a/tmp/905ba635-1d1a-41cd-8cc3-82d909f7d53b_params.py b/tmp/905ba635-1d1a-41cd-8cc3-82d909f7d53b_params.py new file mode 100644 index 0000000000000000000000000000000000000000..1f880768423ee1d7920e46d68ae120478de12d31 --- /dev/null +++ b/tmp/905ba635-1d1a-41cd-8cc3-82d909f7d53b_params.py @@ -0,0 +1,50 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + 
type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + ], +] +eval = dict(runner=dict(task=dict(dump_details=True))) +models = [ + dict( + abbr='mask_gdn-1.3B', + batch_padding=False, + batch_size=16, + max_out_len=100, + max_seq_len=16384, + path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + run_cfg=dict(num_gpus=1), + tokenizer_path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + type='opencompass.models.HuggingFaceCausalLM'), +] +work_dir = 'outputs/default/20251127_164548' diff --git a/tmp/910e81cb-6246-45dc-8fc5-f3e454ea8e3d_params.py b/tmp/910e81cb-6246-45dc-8fc5-f3e454ea8e3d_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/923a2981-382e-4ed3-817a-96babb5897f2_params.py b/tmp/923a2981-382e-4ed3-817a-96babb5897f2_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/924cab89-44ce-4649-a91a-496596c97272_params.py b/tmp/924cab89-44ce-4649-a91a-496596c97272_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e14e794c284ee55239907cc837a70ac931645aba --- /dev/null +++ b/tmp/924cab89-44ce-4649-a91a-496596c97272_params.py @@ -0,0 +1,56 @@ +datasets = [ + [ + dict( + abbr='LongBench_trec', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + ], +] +eval = dict(runner=dict(task=dict(dump_details=True))) +models = [ + dict( + abbr='mask_deltanet', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/train_exp/mask_deltanet_1B_rank4', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251128_162747' diff --git a/tmp/94acbd97-b3b2-4331-9997-18ce0dcd9a24_params.py b/tmp/94acbd97-b3b2-4331-9997-18ce0dcd9a24_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e7c92c31b3a299d34c7c3728a282102358f212d8 --- /dev/null +++ b/tmp/94acbd97-b3b2-4331-9997-18ce0dcd9a24_params.py @@ -0,0 +1,1420 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[133:152]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_7', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_7', + 
eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_7', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_7', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[441:504]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[441:504]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_7', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. 
The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[133:152]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_7', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_7', + 
eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_7', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_7', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[441:504]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[441:504]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_7', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. 
The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_7', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[175:200]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net-1.3B', + batch_size=16, + max_out_len=100, + max_seq_len=16384, + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B', + type='opencompass.models.HuggingFaceCausalLM'), +] +work_dir = 'outputs/default/20251127_163453' diff --git a/tmp/958db9e2-c338-4641-9a63-72c99be25f5e_params.py b/tmp/958db9e2-c338-4641-9a63-72c99be25f5e_params.py new file mode 100644 index 0000000000000000000000000000000000000000..5b5d1066e0130b99348ddf52332ee25ce4491f9d --- /dev/null +++ b/tmp/958db9e2-c338-4641-9a63-72c99be25f5e_params.py @@ -0,0 +1,1424 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[114:133]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + 
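# note: GenInferencer below performs free-form decoding, capped at max_out_len generated tokens per sample +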
type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[378:441]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[378:441]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[114:133]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + 
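# note: outputs for this code-completion task are truncated after max_out_len generated tokens +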
type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[378:441]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[378:441]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251127_230930' diff --git a/tmp/960b3d37-c065-45b5-97a8-f5252b63977f_params.py b/tmp/960b3d37-c065-45b5-97a8-f5252b63977f_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/96d8fd23-49b3-4377-8d7b-ac9837b57b2b_params.py b/tmp/96d8fd23-49b3-4377-8d7b-ac9837b57b2b_params.py new file mode 100644 index 0000000000000000000000000000000000000000..c0afbba520cc612c9749c20fd66004486ff74318 --- /dev/null +++ b/tmp/96d8fd23-49b3-4377-8d7b-ac9837b57b2b_params.py @@ -0,0 +1,53 @@ +datasets = [ + [ + dict( + abbr='LongBench_hotpotqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + ], +] +eval = dict(runner=dict(task=dict(dump_details=True))) +models = [ + dict( + abbr='gated_deltanet', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='download_model/hgrn2-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='download_model/hgrn2-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251219_163447' diff --git a/tmp/997a8731-ec9d-4b11-b014-57c1818f56ad_params.py b/tmp/997a8731-ec9d-4b11-b014-57c1818f56ad_params.py new file mode 100644 index 0000000000000000000000000000000000000000..b4e2fbb801fcaa7dcd158f908a1338636be37e1d --- /dev/null +++ b/tmp/997a8731-ec9d-4b11-b014-57c1818f56ad_params.py @@ -0,0 +1,1382 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + 
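# note: predictions are scored with token-level F1 against the reference answers; generation settings follow in infer_cfg +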
infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251127_220048' diff --git a/tmp/9a2c3c5a-cc9b-4476-b0eb-8c830e569465_params.py b/tmp/9a2c3c5a-cc9b-4476-b0eb-8c830e569465_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/9d21f71f-e607-4df4-ba2d-60d37cc6db97_params.py b/tmp/9d21f71f-e607-4df4-ba2d-60d37cc6db97_params.py new file mode 100644 index 0000000000000000000000000000000000000000..dfc9ce1a8d5dbcf3ee3d1ae7e4cb6c88fb8ad6a7 --- /dev/null +++ b/tmp/9d21f71f-e607-4df4-ba2d-60d37cc6db97_params.py @@ -0,0 +1,1424 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[38:57]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_2', + eval_cfg=dict( 
+ evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. 
The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[38:57]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_2', + eval_cfg=dict( 
+ evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. 
The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='gla', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/gla-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/gla-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251207_205110' diff --git a/tmp/9d7ff971-5cc2-4902-b214-f6075e2a0fc5_params.py b/tmp/9d7ff971-5cc2-4902-b214-f6075e2a0fc5_params.py new file mode 100644 index 0000000000000000000000000000000000000000..8211f8ba30640a0fa7ac2594534cc8afa269c97c --- /dev/null +++ b/tmp/9d7ff971-5cc2-4902-b214-f6075e2a0fc5_params.py @@ -0,0 +1,1424 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[95:114]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + 
type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[315:378]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[315:378]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[95:114]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + 
type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[315:378]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[315:378]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251127_230930' diff --git a/tmp/a0a54c4c-e7ab-4cdb-af19-f44409d7ad60_params.py b/tmp/a0a54c4c-e7ab-4cdb-af19-f44409d7ad60_params.py new file mode 100644 index 0000000000000000000000000000000000000000..11f8615b44540b440a1d794e9382fca64d21a706 --- /dev/null +++ b/tmp/a0a54c4c-e7ab-4cdb-af19-f44409d7ad60_params.py @@ -0,0 +1,1424 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[38:57]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_2', + eval_cfg=dict( 
+ evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. 
The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[38:57]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_2', + eval_cfg=dict( 
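+            # F1 (LongBenchF1Evaluator) computed between the model's 'BOT' turn and the reader_cfg 'answers' column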
+ evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[126:189]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_2', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. 
The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_2', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[50:75]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251127_230930' diff --git a/tmp/a330de69-f57c-4c7e-9aac-18f6959dd2ff_params.py b/tmp/a330de69-f57c-4c7e-9aac-18f6959dd2ff_params.py new file mode 100644 index 0000000000000000000000000000000000000000..3e1de39f4a7d70991955948ad99a4e3b13c34ba9 --- /dev/null +++ b/tmp/a330de69-f57c-4c7e-9aac-18f6959dd2ff_params.py @@ -0,0 +1,1424 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[19:38]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + 
prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[63:126]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[63:126]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[19:38]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + 
prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[63:126]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[63:126]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251127_223020' diff --git a/tmp/a3753eb4-2450-452d-9f59-e314fbd74a38_params.py b/tmp/a3753eb4-2450-452d-9f59-e314fbd74a38_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/a3ec17b0-d4bb-4970-8ff0-89a53d8373d0_params.py b/tmp/a3ec17b0-d4bb-4970-8ff0-89a53d8373d0_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/a58afdb6-e602-4f79-91b0-b96d9c82a790_params.py b/tmp/a58afdb6-e602-4f79-91b0-b96d9c82a790_params.py new file mode 100644 index 0000000000000000000000000000000000000000..835e253ed95b85134b16f89be2f56248694e569e --- /dev/null +++ b/tmp/a58afdb6-e602-4f79-91b0-b96d9c82a790_params.py @@ -0,0 +1,61 @@ +datasets = [ + [ + dict( + abbr='triviaqa_wiki_1shot_0', + eval_cfg=dict( + evaluator=dict(type='opencompass.datasets.TriviaQAEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + ice_template=dict( + template='Q: {question}\nA: {answer}.\n', + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + inferencer=dict( + max_out_len=50, + stopping_criteria=[ + 'Q:', + '\n', + ], + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + ice_token='', + template='Q: {question}\nA: ', + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + fix_id_list=[ + 0, + ], + type='opencompass.openicl.icl_retriever.FixKRetriever')), + path='opencompass/trivia_qa', + reader_cfg=dict( + input_columns=[ + 'question', + ], + output_column='answer', + test_range='[0:1000]', + test_split='validation', + train_split='train'), + type='opencompass.datasets.TriviaQADatasetV2'), + ], +] +models = [ + dict( + abbr='mask_gdn_1B_hrr-rank4_hf', + batch_size=8, + generation_kwargs=dict(), + max_out_len=256, + max_seq_len=None, + model_kwargs=dict(), + pad_token_id=None, + path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + peft_kwargs=dict(), + peft_path=None, + run_cfg=dict(num_gpus=1), + stop_words=[], + tokenizer_kwargs=dict(), + tokenizer_path=None, + type='opencompass.models.huggingface_above_v4_33.HuggingFaceBaseModel' + ), +] +work_dir = 'outputs/default/20251127_193336' diff --git a/tmp/aa1fda67-6fdd-40d6-8fd2-af61c618e3f2_params.py b/tmp/aa1fda67-6fdd-40d6-8fd2-af61c618e3f2_params.py new file mode 100644 index 0000000000000000000000000000000000000000..b76c134b6d4048243125138829aae71f566bb4b0 --- /dev/null +++ b/tmp/aa1fda67-6fdd-40d6-8fd2-af61c618e3f2_params.py @@ -0,0 +1,1424 @@ +datasets = [ + [ + dict( + 
abbr='LongBench_2wikimqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[19:38]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + 
prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[63:126]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[63:126]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[19:38]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + 
prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[63:126]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[63:126]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251127_221150' diff --git a/tmp/aa7e90fd-e798-41e3-9103-1f6c5c423363_params.py b/tmp/aa7e90fd-e798-41e3-9103-1f6c5c423363_params.py new file mode 100644 index 0000000000000000000000000000000000000000..86d53eea4af5b28fb1f45ef2c5b2f53f1dce366e --- /dev/null +++ b/tmp/aa7e90fd-e798-41e3-9103-1f6c5c423363_params.py @@ -0,0 +1,55 @@ +datasets = [ + [ + dict( + abbr='LongBench_samsum', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +eval = dict(runner=dict(task=dict(dump_details=True))) +models = [ + dict( + abbr='gated_deltanet', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='download_model/hgrn2-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='download_model/hgrn2-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251219_163447' diff --git a/tmp/abc42757-c259-41b7-b6da-2ed32a293d52_params.py b/tmp/abc42757-c259-41b7-b6da-2ed32a293d52_params.py new file mode 100644 index 0000000000000000000000000000000000000000..86186420fbc05fd9323ffeda6994939bee96d296 --- /dev/null +++ b/tmp/abc42757-c259-41b7-b6da-2ed32a293d52_params.py @@ -0,0 +1,1424 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[19:38]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + 
prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[63:126]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[63:126]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[19:38]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + 
prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[63:126]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[63:126]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_1', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_1', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[25:50]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/gla-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/gla-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251207_205110' diff --git a/tmp/accd6f4c-627c-45cd-a85c-6a4fb913ec4a_params.py b/tmp/accd6f4c-627c-45cd-a85c-6a4fb913ec4a_params.py new file mode 100644 index 0000000000000000000000000000000000000000..93c43e8cc5cb23c07c7bc035862319678534e524 --- /dev/null +++ b/tmp/accd6f4c-627c-45cd-a85c-6a4fb913ec4a_params.py @@ -0,0 +1,1421 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[95:114]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_5', + 
eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[315:378]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[315:378]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. 
The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[95:114]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_5', + 
eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[315:378]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[315:378]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_5', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. 
The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_5', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[125:150]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='mask_gdn-1.3B', + batch_padding=False, + batch_size=16, + max_out_len=100, + max_seq_len=16384, + path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + run_cfg=dict(num_gpus=1), + tokenizer_path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + type='opencompass.models.HuggingFaceCausalLM'), +] +work_dir = 'outputs/default/20251127_164744' diff --git a/tmp/b0712239-33cc-4fc1-b158-346ac4e4443a_params.py b/tmp/b0712239-33cc-4fc1-b158-346ac4e4443a_params.py new file mode 100644 index 0000000000000000000000000000000000000000..40d221b34054b777c62c7998af3ca99676998082 --- /dev/null +++ b/tmp/b0712239-33cc-4fc1-b158-346ac4e4443a_params.py @@ -0,0 +1,1424 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[76:95]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + 
type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[252:315]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[252:315]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[76:95]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + 
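+ # 64 generated tokens is enough here: the prompt asks only for the next line of code.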
type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[252:315]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[252:315]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_4', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_4', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[100:125]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251127_223020' diff --git a/tmp/b0f24ca4-89a1-47f4-9e91-2812b0c7c7cc_params.py b/tmp/b0f24ca4-89a1-47f4-9e91-2812b0c7c7cc_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/b4488168-54ba-414f-b27c-a03af1ad9135_params.py b/tmp/b4488168-54ba-414f-b27c-a03af1ad9135_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/b4da0723-a261-4a1c-b67a-7f2bc8dce6ef_params.py b/tmp/b4da0723-a261-4a1c-b67a-7f2bc8dce6ef_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/ec9a62c3-a1a6-45f8-98ef-8ca755fd5eb8_params.py b/tmp/ec9a62c3-a1a6-45f8-98ef-8ca755fd5eb8_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/ef0a8ff7-f64f-4d84-b902-80fa9ebb7dac_params.py b/tmp/ef0a8ff7-f64f-4d84-b902-80fa9ebb7dac_params.py new file mode 100644 index 0000000000000000000000000000000000000000..4040392532e82b68c394766665d152f04d6e2451 --- /dev/null +++ b/tmp/ef0a8ff7-f64f-4d84-b902-80fa9ebb7dac_params.py @@ -0,0 +1,1424 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[114:133]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + 
type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[378:441]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[378:441]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'),
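+ # Every *_6 config above evaluates a fixed LongBench slice: [150:175] for most tasks, [114:133] for multifieldqa_en, and [378:441] for the code tasks lcc and repobench-p.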
+ ], +] +models = [ + dict( + abbr='delta_net', + batch_size=128, + max_seq_len=2048, + model_kwargs=dict( + device_map='auto', + torch_dtype='torch.bfloat16', + trust_remote_code=True), + path='/mnt/jfzn/msj/gla-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left'), + tokenizer_path='/mnt/jfzn/msj/gla-1.3B-100B', + type='opencompass.models.HuggingFaceBaseModel'), +] +work_dir = 'outputs/default/20251207_205110' diff --git a/tmp/f10fd862-2cad-4087-b5c1-e6203a062849_params.py b/tmp/f10fd862-2cad-4087-b5c1-e6203a062849_params.py new file mode 100644 index 0000000000000000000000000000000000000000..08a0c703ed465601fc1b1ba1e279ba53d210dd4d --- /dev/null +++ b/tmp/f10fd862-2cad-4087-b5c1-e6203a062849_params.py @@ -0,0 +1,88 @@ +datasets = [ + [ + dict( + abbr='demo_gsm8k', + eval_cfg=dict( + dataset_postprocessor=dict( + type='opencompass.datasets.gsm8k_dataset_postprocess'), + evaluator=dict(type='opencompass.datasets.Gsm8kEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.gsm8k_postprocess')), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + "Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\nAnswer:", + role='HUMAN'), + dict( + prompt= + 'Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. 
Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n', + role='BOT'), + dict( + prompt= + "Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?\nLet's think step by step\nAnswer:", + role='HUMAN'), + dict( + prompt= + "Mark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\nHis team also scores 8 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\nThey scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\nAll together his team scored 50+24+10= 84 points\nMark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\nHis opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\nThey also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\nAll together Mark's opponents scored 100+12+5=117 points\nThe total score for the game is both teams' scores added together, so it is 84+117=201 points\nThe answer is 201\n", + role='BOT'), + dict( + prompt= + "Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nLet's think step by step\nAnswer:", + role='HUMAN'), + dict( + prompt= + "When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n", + role='BOT'), + dict( + prompt= + "Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. 
How many fruits are there?\nLet's think step by step\nAnswer:", + role='HUMAN'), + dict( + prompt= + 'For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n', + role='BOT'), + dict( + prompt= + "Question: {question}\nLet's think step by step\nAnswer:", + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + path='opencompass/gsm8k', + reader_cfg=dict( + input_columns=[ + 'question', + ], + output_column='answer', + test_range='[0:64]'), + type='opencompass.datasets.GSM8KDataset'), + ], +] +models = [ + dict( + abbr='delta_net-1.3B-100B_hf', + batch_size=8, + generation_kwargs=dict(), + max_out_len=256, + max_seq_len=None, + model_kwargs=dict(), + pad_token_id=None, + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + peft_kwargs=dict(), + peft_path=None, + run_cfg=dict(num_gpus=1), + stop_words=[], + tokenizer_kwargs=dict(), + tokenizer_path=None, + type= + 'opencompass.models.huggingface_above_v4_33.HuggingFacewithChatTemplate' + ), +] +work_dir = 'outputs/default/20251127_145531' diff --git a/tmp/f30036b2-455a-491f-ae01-c1a4adf24d49_params.py b/tmp/f30036b2-455a-491f-ae01-c1a4adf24d49_params.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tmp/f43bbade-7759-427f-8a05-80db82e97fa9_params.py b/tmp/f43bbade-7759-427f-8a05-80db82e97fa9_params.py new file mode 100644 index 0000000000000000000000000000000000000000..d1e149e57cc8ab0f165cf85143841473477e0910 --- /dev/null +++ b/tmp/f43bbade-7759-427f-8a05-80db82e97fa9_params.py @@ -0,0 +1,51 @@ +datasets = [ + [ + dict( + abbr='LongBench_multifieldqa_zh', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + ], +] +eval = dict(runner=dict(task=dict(dump_details=True))) +models = [ + dict( + abbr='mask_gdn-1.3B', + batch_padding=False, + batch_size=16, + max_out_len=100, + max_seq_len=16384, + path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + run_cfg=dict(num_gpus=1), + tokenizer_path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4', + type='opencompass.models.HuggingFaceCausalLM'), +] +work_dir = 
'outputs/default/20251127_164548' diff --git a/tmp/f5f6141a-69be-4060-8631-538cbd3d25b2_params.py b/tmp/f5f6141a-69be-4060-8631-538cbd3d25b2_params.py new file mode 100644 index 0000000000000000000000000000000000000000..b375d873306feb713f52ee1c343f195eb87dd845 --- /dev/null +++ b/tmp/f5f6141a-69be-4060-8631-538cbd3d25b2_params.py @@ -0,0 +1,1420 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:19]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + 
prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:63]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:63]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + dict( + abbr='LongBench_2wikimqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:19]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. 
Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + 
prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:63]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:63]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. 
The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_0', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. 
Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multi_news', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmulti_newsDataset'), + dict( + abbr='LongBench_samsum_0', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.samsum_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Summarize the dialogue into a few short sentences. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='samsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[0:25]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchsamsumDataset'), + ], +] +models = [ + dict( + abbr='delta_net-1.3B', + batch_size=16, + max_out_len=100, + max_seq_len=16384, + path='/mnt/jfzn/msj/delta_net-1.3B-100B', + run_cfg=dict(num_gpus=1), + tokenizer_path='/mnt/jfzn/msj/delta_net-1.3B-100B', + type='opencompass.models.HuggingFaceCausalLM'), +] +work_dir = 'outputs/default/20251127_163453' diff --git a/tmp/f761d57d-d312-4694-80fc-fa5796ba8e78_params.py b/tmp/f761d57d-d312-4694-80fc-fa5796ba8e78_params.py new file mode 100644 index 0000000000000000000000000000000000000000..18d18c0facacb88076fbae0ed080f1b53d7dbb51 --- /dev/null +++ b/tmp/f761d57d-d312-4694-80fc-fa5796ba8e78_params.py @@ -0,0 +1,1421 @@ +datasets = [ + [ + dict( + abbr='LongBench_2wikimqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='2wikimqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBench2wikimqaDataset'), + dict( + abbr='LongBench_hotpotqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. 
Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='hotpotqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchhotpotqaDataset'), + dict( + abbr='LongBench_musique_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='musique', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmusiqueDataset'), + dict( + abbr='LongBench_multifieldqa_en_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[114:133]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_enDataset'), + dict( + abbr='LongBench_multifieldqa_zh_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='multifieldqa_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchmultifieldqa_zhDataset'), + dict( + abbr='LongBench_narrativeqa_6', + 
eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a story, which can be either a novel or a movie script, and a question. Answer the question as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story as concisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='narrativeqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchnarrativeqaDataset'), + dict( + abbr='LongBench_qasper_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qasper', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqasperDataset'), + dict( + abbr='LongBench_triviaqa_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchF1Evaluator'), + pred_postprocessor=dict( + type='opencompass.datasets.triviaqa_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Answer the question based on the given passage. Only give me the answer and do not output any other words. 
The following are some examples.\n\n{context}\n\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='triviaqa', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtriviaqaDataset'), + dict( + abbr='LongBench_gov_report_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='gov_report', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchgov_reportDataset'), + dict( + abbr='LongBench_qmsum_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given a meeting transcript and a query containing a question or instruction. 
Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='qmsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchqmsumDataset'), + dict( + abbr='LongBench_vcsum_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='vcsum', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchvcsumDataset'), + dict( + abbr='LongBench_dureader_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=128, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='dureader', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchdureaderDataset'), + dict( + abbr='LongBench_lcc_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. \n{context}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lcc', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + ], + output_column='answers', + test_range='[378:441]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlccDataset'), + dict( + abbr='LongBench_repobench-p_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCodeSimEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please complete the code given below. 
\n{context}{input}Next line of code:\n', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='repobench-p', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[378:441]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchrepobenchDataset'), + dict( + abbr='LongBench_passage_retrieval_en_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like "Paragraph 1", "Paragraph 2", etc.\n\nThe answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_en', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_enDataset'), + dict( + abbr='LongBench_passage_retrieval_zh_6', + eval_cfg=dict( + evaluator=dict( + language='zh', + type='opencompass.datasets.LongBenchRetrievalEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + '以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是"段落1","段落2"等格式\n\n答案是:', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_retrieval_zh', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_retrieval_zhDataset'), + dict( + abbr='LongBench_passage_count_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchCountEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=32, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. 
The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='passage_count', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='answers', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchpassage_countDataset'), + dict( + abbr='LongBench_trec_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.trec_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='trec', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchtrecDataset'), + dict( + abbr='LongBench_lsht_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchClassificationEvaluator' + ), + pred_postprocessor=dict( + type='opencompass.datasets.lsht_postprocess'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=64, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt='请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}', + role='HUMAN'), + ]), + type= + 'opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + name='lsht', + path='opencompass/Longbench', + reader_cfg=dict( + input_columns=[ + 'context', + 'input', + ], + output_column='all_labels', + test_range='[150:175]', + test_split='test', + train_split='test'), + type='opencompass.datasets.LongBenchlshtDataset'), + dict( + abbr='LongBench_multi_news_6', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LongBenchRougeEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=512, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict(round=[ + dict( + prompt= + 'You are given several news passages. Write a one-page summary of all news. 
\n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n',
+                        role='HUMAN'),
+                ]),
+                type=
+                'opencompass.openicl.icl_prompt_template.PromptTemplate'),
+            retriever=dict(
+                type='opencompass.openicl.icl_retriever.ZeroRetriever')),
+        name='multi_news',
+        path='opencompass/Longbench',
+        reader_cfg=dict(
+            input_columns=[
+                'context',
+            ],
+            output_column='answers',
+            test_range='[150:175]',
+            test_split='test',
+            train_split='test'),
+        type='opencompass.datasets.LongBenchmulti_newsDataset'),
+    dict(
+        abbr='LongBench_samsum_6',
+        eval_cfg=dict(
+            evaluator=dict(
+                type='opencompass.datasets.LongBenchRougeEvaluator'),
+            pred_postprocessor=dict(
+                type='opencompass.datasets.samsum_postprocess'),
+            pred_role='BOT'),
+        infer_cfg=dict(
+            inferencer=dict(
+                max_out_len=128,
+                type='opencompass.openicl.icl_inferencer.GenInferencer'),
+            prompt_template=dict(
+                template=dict(round=[
+                    dict(
+                        prompt=
+                        'Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}',
+                        role='HUMAN'),
+                ]),
+                type=
+                'opencompass.openicl.icl_prompt_template.PromptTemplate'),
+            retriever=dict(
+                type='opencompass.openicl.icl_retriever.ZeroRetriever')),
+        name='samsum',
+        path='opencompass/Longbench',
+        reader_cfg=dict(
+            input_columns=[
+                'context',
+                'input',
+            ],
+            output_column='answers',
+            test_range='[150:175]',
+            test_split='test',
+            train_split='test'),
+        type='opencompass.datasets.LongBenchsamsumDataset'),
+    ],
+]
+models = [
+    dict(
+        abbr='mask_gdn-1.3B',
+        batch_padding=False,
+        batch_size=16,
+        max_out_len=100,
+        max_seq_len=16384,
+        path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4',
+        run_cfg=dict(num_gpus=1),
+        tokenizer_path='/mnt/jfzn/msj/train_exp/mask_gdn_1B_hrr-rank4',
+        type='opencompass.models.HuggingFaceCausalLM'),
+]
+work_dir = 'outputs/default/20251127_164744'
diff --git a/tools/collect_code_preds.py b/tools/collect_code_preds.py
new file mode 100644
index 0000000000000000000000000000000000000000..91fea3e775cedeea98395866878428004ef018b9
--- /dev/null
+++ b/tools/collect_code_preds.py
@@ -0,0 +1,209 @@
+import argparse
+import json
+import os
+import os.path as osp
+import re
+
+import mmengine
+from mmengine import Config
+from mmengine.utils import mkdir_or_exist
+
+from opencompass.datasets.humanevalx import _clean_up_code
+from opencompass.utils import (dataset_abbr_from_cfg, get_infer_output_path,
+                               get_logger, model_abbr_from_cfg)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Collect Humanevalx dataset predictions.')
+    parser.add_argument('config', help='Config file path')
+    parser.add_argument('-r',
+                        '--reuse',
+                        nargs='?',
+                        type=str,
+                        const='latest',
+                        help='Reuse previous outputs & results, and run any '
+                        'missing jobs presented in the config. If its '
+                        'argument is not specified, the latest results in '
+                        'the work_dir will be reused. The argument should '
+                        'also be a specific timestamp, e.g. 20230516_144254')
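+    # Passing -r with no value falls back to const='latest', i.e. the most
+    # recent timestamped directory under work_dir.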
+    args = parser.parse_args()
+    return args
+
+
+_LANGUAGE_NAME_DICT = {
+    'cpp': 'CPP',
+    'go': 'Go',
+    'java': 'Java',
+    'js': 'JavaScript',
+    'python': 'Python',
+    'rust': 'Rust',
+}
+FAILED = 0
+SUCCEED = 1
+
+
+def gpt_python_postprocess(ori_prompt: str, text: str) -> str:
+    """Answer postprocessor for better instruction-aligned models like
+    GPT."""
+    if '```' in text:
+        blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+        if len(blocks) == 0:
+            text = text.split('```')[1]  # fall back to default strategy
+        else:
+            text = blocks[0]  # fetch the first code block
+        if not text.startswith('\n'):  # in case starting with ```python
+            text = text[max(text.find('\n') + 1, 0):]
+
+    # drop a repeated function signature if the model echoed the prompt's
+    # `def` line back in its completion
+    match_ori = re.search(r'def(.*?)\(', ori_prompt)
+    match = re.search(r'def(.*?)\(', text)
+    if match and match_ori:
+        if match.group() == match_ori.group():
+            text = re.sub('def(.*?)\n', '', text, count=1)
+
+    # re-indent the first line to 4 spaces so the body aligns with the stub
+    for c_index, c in enumerate(text[:5]):
+        if c != ' ':
+            text = ' ' * (4 - c_index) + text
+            break
+
+    text = text.split('\n\n\n')[0]
+    return text
+
+
+def wizardcoder_postprocess(text: str) -> str:
+    """Postprocess for WizardCoder models."""
+    if '```' in text:
+        blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+        if len(blocks) == 0:
+            text = text.split('```')[1]  # fall back to default strategy
+        else:
+            text = blocks[0]  # fetch the first code block
+        if not text.startswith('\n'):  # in case starting with ```python
+            text = text[max(text.find('\n') + 1, 0):]
+    else:
+        match = re.search(r'Here(.*?)\n', text)
+        if match:
+            text = re.sub('Here(.*?)\n', '', text, count=1)
+
+    return text
+
+
+def collect_preds(filename: str):
+    # in case the prediction is partial
+    root, ext = osp.splitext(filename)
+    partial_filename = root + '_0' + ext
+    # collect all the prediction results
+    if not osp.exists(osp.realpath(filename)) and not osp.exists(
+            osp.realpath(partial_filename)):
+        print(f'No predictions found for {filename}')
+        return FAILED, None, None
+    else:
+        if osp.exists(osp.realpath(filename)):
+            preds = mmengine.load(filename)
+            pred_strs = [
+                preds[str(i)]['prediction'] for i in range(len(preds))
+            ]
+            ori_prompt_strs = [
+                preds[str(i)]['origin_prompt'] for i in range(len(preds))
+            ]
+        else:
+            # predictions are sharded into <root>_0.json, <root>_1.json,
+            # ...; concatenate them in index order
+            filename = partial_filename
+            pred_strs = []
+            ori_prompt_strs = []
+            i = 1
+            while osp.exists(osp.realpath(filename)):
+                preds = mmengine.load(filename)
+                filename = root + f'_{i}' + ext
+                i += 1
+                pred_strs += [
+                    preds[str(i)]['prediction'] for i in range(len(preds))
+                ]
+                ori_prompt_strs += [
+                    preds[str(i)]['origin_prompt'] for i in range(len(preds))
+                ]
+    return SUCCEED, ori_prompt_strs, pred_strs
+
+
+def main():
+    args = parse_args()
+    # initialize logger
+    logger = get_logger(log_level='INFO')
+    cfg = Config.fromfile(args.config)
+    cfg.setdefault('work_dir', './outputs/default/')
+
+    assert args.reuse, 'Please provide the experiment work dir.'
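+    # Resolve which timestamped run directory under work_dir to collect
+    # from: the latest one by default, or an explicit timestamp passed
+    # via -r/--reuse.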
+    if args.reuse:
+        if args.reuse == 'latest':
+            if not os.path.exists(cfg.work_dir) or not os.listdir(
+                    cfg.work_dir):
+                logger.warning('No previous results to reuse!')
+            else:
+                dirs = os.listdir(cfg.work_dir)
+                dir_time_str = sorted(dirs)[-1]
+        else:
+            dir_time_str = args.reuse
+        logger.info(f'Reusing experiments from {dir_time_str}')
+        # update "actual" work_dir
+        cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str)
+
+    for model in cfg.models:
+        model_abbr = model_abbr_from_cfg(model)
+        for dataset in cfg.datasets:
+            dataset_abbr = dataset_abbr_from_cfg(dataset)
+            filename = get_infer_output_path(
+                model, dataset, osp.join(cfg.work_dir, 'predictions'))
+
+            succeed, ori_prompt_strs, pred_strs = collect_preds(filename)
+            if not succeed:
+                continue
+
+            # infer the language type from the dataset abbreviation
+            # (humanevalx dataset abbrs are expected to contain one of the
+            # keys of _LANGUAGE_NAME_DICT)
+            for k, v in _LANGUAGE_NAME_DICT.items():
+                if k in dataset_abbr:
+                    lang = k
+                    task = v
+                    break
+
+            # special postprocess for WizardCoder models
+            if model_abbr in [
+                    'WizardCoder-1B-V1.0',
+                    'WizardCoder-3B-V1.0',
+                    'WizardCoder-15B-V1.0',
+                    'WizardCoder-Python-13B-V1.0',
+                    'WizardCoder-Python-34B-V1.0',
+            ]:
+                predictions = [{
+                    'task_id': f'{task}/{i}',
+                    'generation': wizardcoder_postprocess(pred),
+                } for i, pred in enumerate(pred_strs)]
+            # special postprocess for GPT-style instruction-aligned models
+            # on Python tasks
+            elif 'CodeLlama' not in model_abbr and lang == 'python':
+                predictions = [{
+                    'task_id':
+                    f'{task}/{i}',
+                    'generation':
+                    gpt_python_postprocess(ori_prompt, pred),
+                } for i, (ori_prompt,
+                          pred) in enumerate(zip(ori_prompt_strs, pred_strs))]
+            else:
+                predictions = [{
+                    'task_id': f'{task}/{i}',
+                    'generation': _clean_up_code(pred, lang),
+                } for i, pred in enumerate(pred_strs)]
+
+            # save processed results if not exists
+            result_file_path = os.path.join(cfg['work_dir'], 'humanevalx',
+                                            model_abbr,
+                                            f'humanevalx_{lang}.json')
+            if osp.exists(result_file_path):
+                logger.info(
+                    f'File exists for {model_abbr}, skip copy from '
+                    'predictions.')
+            else:
+                mkdir_or_exist(osp.split(result_file_path)[0])
+                # dump one JSON object per line (.jsonl style), as expected
+                # by the humanevalx evaluation service
+                with open(result_file_path, 'w') as f:
+                    for pred in predictions:
+                        f.write(json.dumps(pred) + '\n')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/compare_configs.py b/tools/compare_configs.py
new file mode 100644
index 0000000000000000000000000000000000000000..480e0307eaa23423f1b4b938a589a302925e0867
--- /dev/null
+++ b/tools/compare_configs.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+import argparse
+import filecmp
+import os
+
+from mmengine.logging import MMLogger
+
+
+def get_files(folder, extensions, ignore_folder=()):
+    """Get all file paths in the folder with specified extensions."""
+    files = []
+    for root, dirs, files_in_dir in os.walk(folder):
+        for file in files_in_dir:
+            if any(file.endswith(ext) for ext in extensions):
+                files.append(os.path.relpath(os.path.join(root, file),
+                                             folder))
+    ignore_folders = [os.path.relpath(ignored) for ignored in ignore_folder]
+    # drop the files located under any folder in ignore_folder
+    keep_files = [
+        file for file in files
+        if not any(file.startswith(ignored) for ignored in ignore_folders)
+    ]
+    return keep_files
+
+
+def compare_folders(folder1, folder2, extensions, ignore_folder):
+    """Compare files with specified extensions in two folders."""
+    logger = MMLogger.get_current_instance()
+    # the ignore list is applied to folder1 only; extra files in folder2
+    # are merely reported
+    files1 = set(get_files(folder1, extensions, ignore_folder))
+    files2 = set(get_files(folder2, extensions))
+
+    # Check for files that are only in one folder
+    only_in_folder1 = files1 - files2
+    only_in_folder2 = files2 - files1
+    common_files = files1 & files2
+
+    if only_in_folder1:
+        message = f'Only in {folder1}: {only_in_folder1}, '\
+            f'please copy files into {folder2}'
+        raise ValueError(message)
+    if only_in_folder2:
+        print(f'Only in {folder2}: {only_in_folder2}')
+
+    # Compare the content of common files
+    for file in common_files:
+        file1 = os.path.join(folder1, file)
+        file2 = os.path.join(folder2, file)
+        if not filecmp.cmp(file1, file2, shallow=False):
+            logger.warning(f'Files differ: {file1} and {file2}')
+            raise ValueError(f'Files differ: {file1} and {file2}')
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Compare specified files in two folders')
+    parser.add_argument('folder1', help='Path to the first folder')
+    parser.add_argument('folder2', help='Path to the second folder')
+    parser.add_argument(
+        '--extensions',
+        nargs='+',
+        default=['.py', '.json', '.md', '.yml', '.txt'],
+        help='File extensions to compare (default: .py .json .md .yml .txt)')
+    parser.add_argument('--ignore',
+                        nargs='+',
+                        default=[],
+                        help='Folders to ignore during the comparison')
+    args = parser.parse_args()
+
+    compare_folders(args.folder1, args.folder2, args.extensions, args.ignore)
+
+
+if __name__ == '__main__':
+    main()
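+
+# Example invocation (paths are illustrative):
+#   python tools/compare_configs.py configs/ opencompass/configs/ \
+#       --extensions .py .json --ignore configs/legacy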
diff --git a/tools/convert_alignmentbench.py b/tools/convert_alignmentbench.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f9699f9c4d69b555aa0799795ac10cb494d463b
--- /dev/null
+++ b/tools/convert_alignmentbench.py
@@ -0,0 +1,94 @@
+import argparse
+import csv
+import json
+import os
+from glob import glob
+
+from tqdm import tqdm
+
+
+def extract_predictions_from_json(input_folder):
+
+    sub_folder = os.path.join(input_folder, 'submission')
+    pred_folder = os.path.join(input_folder, 'predictions')
+    if not os.path.exists(sub_folder):
+        os.makedirs(sub_folder)
+
+    for model_name in os.listdir(pred_folder):
+        model_folder = os.path.join(pred_folder, model_name)
+        try:
+            # when the predictions are split into alignment_bench_0.json,
+            # alignment_bench_1.json, ...
+            json_paths = glob(
+                os.path.join(model_folder, 'alignment_bench_*.json'))
+            # sorted by index
+            json_paths = sorted(
+                json_paths,
+                key=lambda x: int(x.split('.json')[0].split('_')[-1]))
+        except Exception as e:
+            # when there is only one complete file
+            print(e)
+            json_paths = [os.path.join(model_folder, 'alignment_bench.json')]
+
+        all_predictions = []
+        for json_ in json_paths:
+            with open(json_, 'r', encoding='utf-8') as f:
+                json_data = json.load(f)
+            for _, value in json_data.items():
+                prediction = value['prediction']
+                all_predictions.append(prediction)
+
+        # write one prediction per row for submission
+        output_path = os.path.join(sub_folder, model_name + '_submission.csv')
+        with open(output_path, 'w', encoding='utf-8-sig') as file:
+            writer = csv.writer(file)
+            for ans in tqdm(all_predictions):
+                writer.writerow([str(ans)])
+        print('Saved {} for submission'.format(output_path))
+
+
+def process_jsonl(file_path):
+    new_data = []
+    with open(file_path, 'r', encoding='utf-8') as file:
+        for line in file:
+            json_data = json.loads(line)
+            new_dict = {
+                'question': json_data['question'],
+                'capability': json_data['category'],
+                'others': {
+                    'subcategory': json_data['subcategory'],
+                    'reference': json_data['reference'],
+                    'question_id': json_data['question_id']
+                }
+            }
+            new_data.append(new_dict)
+    return new_data
+
+
+def save_as_json(data, output_file='./alignment_bench.json'):
+    with open(output_file, 'w', encoding='utf-8') as file:
+        json.dump(data, file, indent=4, ensure_ascii=False)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='File Converter')
+    parser.add_argument('--mode',
+                        default='json',
+                        help="Conversion mode: 'json' (convert the raw "
+                        "jsonl to json) or 'csv' (extract predictions "
+                        'to csv)')
+    parser.add_argument('--jsonl',
+                        default='./data_release.jsonl',
+                        help='The original jsonl path')
+    parser.add_argument('--json',
+                        default='./alignment_bench.json',
+                        help='The results json path')
+    parser.add_argument('--exp-folder',
+                        help='The experiment folder containing predictions')
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    mode = args.mode
+    if mode == 'json':
+        processed_data = process_jsonl(args.jsonl)
+        save_as_json(processed_data, args.json)
+    elif mode == 'csv':
+        extract_predictions_from_json(args.exp_folder)
diff --git a/tools/list_configs.py b/tools/list_configs.py
new file mode 100644
index 0000000000000000000000000000000000000000..9aa6cc490e35991223da2705617d22ac79b5b264
--- /dev/null
+++ b/tools/list_configs.py
@@ -0,0 +1,37 @@
+import argparse
+
+import tabulate
+
+from opencompass.utils import match_files
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Utils to list available models and datasets.')
+    parser.add_argument('pattern',
+                        nargs='*',
+                        default='*',
+                        type=str,
+                        help='Patterns, wildcard matching supported.')
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    models = match_files('opencompass/configs/models/',
+                         args.pattern,
+                         fuzzy=True)
+    if models:
+        table = [['Model', 'Config Path'], *models]
+        print(tabulate.tabulate(table, headers='firstrow', tablefmt='psql'))
+    datasets = match_files('opencompass/configs/datasets/',
+                           args.pattern,
+                           fuzzy=True)
+    if datasets:
+        table = [['Dataset', 'Config Path'], *datasets]
+        print(tabulate.tabulate(table, headers='firstrow', tablefmt='psql'))
+
+
+if __name__ == '__main__':
+    main()
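+
+# Example invocation (pattern is illustrative):
+#   python tools/list_configs.py llama mmlu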
diff --git a/tools/prediction_merger.py b/tools/prediction_merger.py
new file mode 100644
index 0000000000000000000000000000000000000000..2614686d00a801ecc1b224b66082e778f26bdcb6
--- /dev/null
+++ b/tools/prediction_merger.py
@@ -0,0 +1,116 @@
+import argparse
+import copy
+import json
+import os
+
+import mmengine
+from mmengine.config import Config, ConfigDict
+
+from opencompass.utils import build_dataset_from_cfg, get_infer_output_path
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Merge partitioned predictions')
+    parser.add_argument('config', help='Train config file path')
+    parser.add_argument('-w', '--work-dir', default=None, type=str)
+    parser.add_argument('-r', '--reuse', default='latest', type=str)
+    parser.add_argument('-c', '--clean', action='store_true')
+    parser.add_argument('-f', '--force', action='store_true')
+    args = parser.parse_args()
+    return args
+
+
+class PredictionMerger:
+
+    def __init__(self, cfg: ConfigDict) -> None:
+        self.cfg = cfg
+        self.model_cfg = copy.deepcopy(self.cfg['model'])
+        self.dataset_cfg = copy.deepcopy(self.cfg['dataset'])
+        self.work_dir = self.cfg.get('work_dir')
+
+    def run(self):
+        filename = get_infer_output_path(
+            self.model_cfg, self.dataset_cfg,
+            os.path.join(self.work_dir, 'predictions'))
+        root, ext = os.path.splitext(filename)
+        partial_filename = root + '_0' + ext
+
+        # skip if the merged file already exists, unless --force is given
+        if os.path.exists(
+                os.path.realpath(filename)) and not self.cfg['force']:
+            return
+
+        if not os.path.exists(os.path.realpath(partial_filename)):
+            print(f'{filename} not found')
+            return
+
+        # Load predictions from <root>_0.json, <root>_1.json, ... and
+        # renumber them with a global offset
+        partial_filenames = []
+        preds, offset = {}, 0
+        i = 1
+        while os.path.exists(os.path.realpath(partial_filename)):
+            partial_filenames.append(os.path.realpath(partial_filename))
+            _preds = mmengine.load(partial_filename)
+            partial_filename = root + f'_{i}' + ext
+            i += 1
+            for _o in range(len(_preds)):
+                preds[str(offset)] = _preds[str(_o)]
+                offset += 1
+
+        dataset = build_dataset_from_cfg(self.dataset_cfg)
+        if len(preds) != len(dataset.test):
+            print(f'length mismatch: {len(preds)} predictions vs '
+                  f'{len(dataset.test)} test samples, skip merging')
+            return
+
+        print(f'Merge {partial_filenames} to {filename}')
+        with open(filename, 'w', encoding='utf-8') as f:
+            json.dump(preds, f, indent=4, ensure_ascii=False)
+
+        if self.cfg['clean']:
+            for partial_filename in partial_filenames:
+                print(f'Remove {partial_filename}')
+                os.remove(partial_filename)
+
+
+def dispatch_tasks(cfg):
+    for model in cfg['models']:
+        for dataset in cfg['datasets']:
+            PredictionMerger({
+                'model': model,
+                'dataset': dataset,
+                'work_dir': cfg['work_dir'],
+                'clean': cfg['clean'],
+                'force': cfg['force'],
+            }).run()
+
+
+def main():
+    args = parse_args()
+    cfg = Config.fromfile(args.config)
+    # set work_dir
+    if args.work_dir is not None:
+        cfg['work_dir'] = args.work_dir
+    else:
+        cfg.setdefault('work_dir', './outputs/default')
+
+    if args.reuse:
+        if args.reuse == 'latest':
+            if not os.path.exists(cfg.work_dir) or not os.listdir(
+                    cfg.work_dir):
+                print('No previous results to reuse!')
+                return
+            else:
+                dirs = os.listdir(cfg.work_dir)
+                dir_time_str = sorted(dirs)[-1]
+        else:
+            dir_time_str = args.reuse
+        cfg['work_dir'] = os.path.join(cfg.work_dir, dir_time_str)
+
+    cfg['clean'] = args.clean
+    cfg['force'] = args.force
+
+    dispatch_tasks(cfg)
+
+
+if __name__ == '__main__':
+    main()
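+
+# Example invocation (timestamp is illustrative):
+#   python tools/prediction_merger.py configs/eval_demo.py \
+#       -r 20230516_144254 --clean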
diff --git a/tools/prompt_viewer.py b/tools/prompt_viewer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed7c0c96f7ed30d6b83a79f9ddf2863b0c5c2bc7
--- /dev/null
+++ b/tools/prompt_viewer.py
@@ -0,0 +1,227 @@
+import argparse
+import fnmatch
+from typing import Dict
+
+from mmengine.config import Config, ConfigDict
+
+from opencompass.openicl.icl_inferencer import (AgentInferencer,
+                                                ChatInferencer, CLPInferencer,
+                                                GenInferencer, LLInferencer,
+                                                PPLInferencer,
+                                                PPLOnlyInferencer)
+from opencompass.registry import ICL_PROMPT_TEMPLATES, ICL_RETRIEVERS
+from opencompass.utils import (Menu, build_dataset_from_cfg,
+                               build_model_from_cfg, dataset_abbr_from_cfg,
+                               model_abbr_from_cfg)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='View generated prompts based on datasets (and models)')
+    parser.add_argument('config', help='Train config file path')
+    parser.add_argument('-n', '--non-interactive', action='store_true')
+    parser.add_argument('-a', '--all', action='store_true')
+    parser.add_argument('-p',
+                        '--pattern',
+                        type=str,
+                        help='To match the dataset abbr.')
+    parser.add_argument('-c',
+                        '--count',
+                        type=int,
+                        default=1,
+                        help='Number of prompts to print')
+    args = parser.parse_args()
+    return args
+
+
+def parse_model_cfg(model_cfg: ConfigDict) -> Dict[str, ConfigDict]:
+    model2cfg = {}
+    for model in model_cfg:
+        model2cfg[model_abbr_from_cfg(model)] = model
+    return model2cfg
+
+
+def parse_dataset_cfg(dataset_cfg: ConfigDict) -> Dict[str, ConfigDict]:
+    dataset2cfg = {}
+    for dataset in dataset_cfg:
+        dataset2cfg[dataset_abbr_from_cfg(dataset)] = dataset
+    return dataset2cfg
+
+
+def print_prompts(model_cfg, dataset_cfg, count=1):
+    # TODO: A really dirty method that copies code from PPLInferencer and
+    # GenInferencer. In the future, the prompt extraction code should be
+    # extracted and generalized as a static method in these Inferencers
+    # and reused here.
+    if model_cfg:
+        max_seq_len = model_cfg.get('max_seq_len', 32768)
+        if not model_cfg['type'].is_api:
+            # a tokenizer suffices for counting prompt tokens; do not load
+            # the full model weights
+            model_cfg['tokenizer_only'] = True
+        model = build_model_from_cfg(model_cfg)
+    else:
+        max_seq_len = None
+        model = None
+
+    infer_cfg = dataset_cfg.get('infer_cfg')
+
+    dataset = build_dataset_from_cfg(dataset_cfg)
+
+    ice_template = None
+    if hasattr(infer_cfg, 'ice_template'):
+        ice_template = ICL_PROMPT_TEMPLATES.build(infer_cfg['ice_template'])
+
+    prompt_template = None
+    if hasattr(infer_cfg, 'prompt_template'):
+        prompt_template = ICL_PROMPT_TEMPLATES.build(
+            infer_cfg['prompt_template'])
+
+    infer_cfg['retriever']['dataset'] = dataset
+    retriever = ICL_RETRIEVERS.build(infer_cfg['retriever'])
+
+    ice_idx_list = retriever.retrieve()
+
+    supported_inferencer = [
+        AgentInferencer, PPLInferencer, GenInferencer, CLPInferencer,
+        PPLOnlyInferencer, ChatInferencer, LLInferencer
+    ]
+    if infer_cfg.inferencer.type not in supported_inferencer:
+        print(f'Only {supported_inferencer} are supported')
+        return
+
+    for idx in range(min(count, len(ice_idx_list))):
+        if issubclass(infer_cfg.inferencer.type,
+                      (PPLInferencer, LLInferencer)):
+            labels = retriever.get_labels(ice_template=ice_template,
+                                          prompt_template=prompt_template)
+            ice = retriever.generate_ice(ice_idx_list[idx],
+                                         ice_template=ice_template)
+            print('-' * 100)
+            print('ICE Template:')
+            print('-' * 100)
+            print(ice)
+            print('-' * 100)
+            for label in labels:
+                prompt = retriever.generate_label_prompt(
+                    idx,
+                    ice,
+                    label,
+                    ice_template=ice_template,
+                    prompt_template=prompt_template,
+                    remain_sep=None)
+                if max_seq_len is not None:
+                    prompt_token_num = model.get_token_len_from_template(
+                        prompt)
+                    # drop in-context examples one by one until the prompt
+                    # fits within max_seq_len
+                    while len(ice_idx_list[idx]
+                              ) > 0 and prompt_token_num > max_seq_len:
+                        num_ice = len(ice_idx_list[idx])
+                        print(f'Truncating ice {num_ice} -> {num_ice - 1}',
+                              f'Number of tokens: {prompt_token_num} -> ...')
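+                        # remove the last in-context example and rebuild
+                        # the prompt before measuring again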
+                        ice_idx_list[idx] = ice_idx_list[idx][:-1]
+                        ice = retriever.generate_ice(
+                            ice_idx_list[idx], ice_template=ice_template)
+                        prompt = retriever.generate_label_prompt(
+                            idx,
+                            ice,
+                            label,
+                            ice_template=ice_template,
+                            prompt_template=prompt_template)
+                        prompt_token_num = model.get_token_len_from_template(
+                            prompt)
+                    print(f'Number of tokens: {prompt_token_num}')
+                if model is not None:
+                    prompt = model.parse_template(prompt, mode='ppl')
+                print('-' * 100)
+                print(f'Label: {label}')
+                print('Sample prompt:')
+                print('-' * 100)
+                print(prompt)
+                print('-' * 100)
+        else:
+            ice_idx = ice_idx_list[idx]
+            ice = retriever.generate_ice(ice_idx, ice_template=ice_template)
+            prompt = retriever.generate_prompt_for_generate_task(
+                idx,
+                ice,
+                gen_field_replace_token=infer_cfg.inferencer.get(
+                    'gen_field_replace_token', ''),
+                ice_template=ice_template,
+                prompt_template=prompt_template)
+            if max_seq_len is not None:
+                prompt_token_num = model.get_token_len_from_template(prompt)
+                # same truncation strategy for generation-style prompts
+                while len(ice_idx) > 0 and prompt_token_num > max_seq_len:
+                    num_ice = len(ice_idx)
+                    print(f'Truncating ice {num_ice} -> {num_ice - 1}',
+                          f'Number of tokens: {prompt_token_num} -> ...')
+                    ice_idx = ice_idx[:-1]
+                    ice = retriever.generate_ice(ice_idx,
+                                                 ice_template=ice_template)
+                    prompt = retriever.generate_prompt_for_generate_task(
+                        idx,
+                        ice,
+                        gen_field_replace_token=infer_cfg.inferencer.get(
+                            'gen_field_replace_token', ''),
+                        ice_template=ice_template,
+                        prompt_template=prompt_template)
+                    prompt_token_num = model.get_token_len_from_template(
+                        prompt)
+                print(f'Number of tokens: {prompt_token_num}')
+            if model is not None:
+                prompt = model.parse_template(prompt, mode='gen')
+            print('-' * 100)
+            print('Sample prompt:')
+            print('-' * 100)
+            print(prompt)
+            print('-' * 100)
+
+
+def main():
+    args = parse_args()
+    cfg = Config.fromfile(args.config)
+    model2cfg = parse_model_cfg(cfg.models) if 'models' in cfg else {
+        'None': None
+    }
+    if 'datasets' in cfg:
+        dataset2cfg = parse_dataset_cfg(cfg.datasets)
+    else:
+        dataset2cfg = {}
+        for key in cfg.keys():
+            if key.endswith('_datasets'):
+                dataset2cfg.update(parse_dataset_cfg(cfg[key]))
+
+    if args.pattern is not None:
+        matches = fnmatch.filter(dataset2cfg, args.pattern)
+        if len(matches) == 0:
+            raise ValueError('No dataset matches the pattern. 
Please select from: \n' + + '\n'.join(dataset2cfg.keys())) + dataset2cfg = {k: dataset2cfg[k] for k in matches} + + if not args.all: + if not args.non_interactive: + model, dataset = Menu( + [list(model2cfg.keys()), + list(dataset2cfg.keys())], [ + f'Please make a selection of {s}:' + for s in ['model', 'dataset'] + ]).run() + else: + model = list(model2cfg.keys())[0] + dataset = list(dataset2cfg.keys())[0] + model_cfg = model2cfg[model] + dataset_cfg = dataset2cfg[dataset] + print_prompts(model_cfg, dataset_cfg, args.count) + else: + for model_abbr, model_cfg in model2cfg.items(): + for dataset_abbr, dataset_cfg in dataset2cfg.items(): + print('=' * 64, '[BEGIN]', '=' * 64) + print(f'[MODEL]: {model_abbr}') + print(f'[DATASET]: {dataset_abbr}') + print('---') + print_prompts(model_cfg, dataset_cfg, args.count) + print('=' * 65, '[END]', '=' * 65) + print() + + +if __name__ == '__main__': + main() diff --git a/tools/test_api_model.py b/tools/test_api_model.py new file mode 100644 index 0000000000000000000000000000000000000000..bcb03f7e8b25957a5d7ac47e622cf2ba1f32349a --- /dev/null +++ b/tools/test_api_model.py @@ -0,0 +1,206 @@ +import argparse +from typing import Dict + +from mmengine.config import Config, ConfigDict + +from opencompass.utils import Menu, build_model_from_cfg, model_abbr_from_cfg +from opencompass.utils.prompt import PromptList + +test_prompts = [ + PromptList([ + { + 'section': 'begin', + 'pos': 'begin' + }, + { + 'role': + 'SYSTEM', + 'fallback_role': + 'HUMAN', + 'prompt': + 'The following are multiple choice questions (with answers) about professional law.' # noqa + }, + '', + { + 'section': 'ice', + 'pos': 'begin' + }, + { + 'role': + 'HUMAN', + 'prompt': + "Without a warrant, police officers searched the garbage cans in the alley behind a man's house and discovered chemicals used to make methamphetamine, as well as cooking utensils and containers with the man's fingerprints on them. The alley was a public thoroughfare maintained by the city, and the garbage was picked up once a week by a private sanitation company. The items were found inside the garbage cans in plastic bags that had been tied closed and further secured with tape. The man was charged in federal court with the manufacture of methamphetamine. Did the search of the garbage cans violate the Fourth Amendment?\nA. No, because the man had no reasonable expectation of privacy in garbage left in the alley.\nB. No, because the probative value of the evidence outweighs the man's modest privacy claims in his garbage.\nC. Yes, because the alley was within the curtilage of the man's home and entry without a warrant was unconstitutional.\nD. Yes, because there is a reasonable expectation of privacy in one's secured garbage containers.\nAnswer: " # noqa + }, + { + 'role': 'BOT', + 'prompt': 'A\n' + }, + { + 'section': 'ice', + 'pos': 'end' + }, + { + 'section': 'ice', + 'pos': 'begin' + }, + { + 'role': + 'HUMAN', + 'prompt': + 'A man borrowed $500,000 from a bank, securing the loan with a mortgage on a commercial building he owned. The mortgage provided as follows: "No prepayment may be made on this loan during the first two years after the date of this mortgage. Thereafter, prepayment may be made in any amount at any time but only if accompanied by a prepayment fee of 5% of the amount prepaid." One year later, the man received an unexpected cash gift of $1 million and wished to pay off the $495,000 principal balance still owed on the loan. $495,000 principal balance still owed on the loan. 
Concerned that the bank might refuse prepayment, despite a rise in market interest rates in the year since the loan was made, or at least insist on the 5% prepayment fee, the man consulted an attorney concerning the enforceability of the above-quoted clause. There is no applicable statute. What is the attorney likely to say? \nA. The entire clause is unenforceable, because it violates a public policy favoring the prompt and early repayment of debt.\nB. The entire clause is unenforceable, because the rise in interest rates will allow the bank to reloan the funds without loss.\nC. The two-year prepayment prohibition and the prepayment fee provision are both valid and enforceable.\nD. The two-year prepayment prohibition is unenforceable, but the prepayment fee provision is enforceable.\nAnswer: ' # noqa + }, + { + 'role': 'BOT', + 'prompt': 'D\n' + }, + { + 'section': 'ice', + 'pos': 'end' + }, + { + 'section': 'ice', + 'pos': 'begin' + }, + { + 'role': + 'HUMAN', + 'prompt': + "A woman and a defendant entered into an arrangement where the woman promised to pay the defendant $10,000 to act as a surrogate mother. In return, the defendant agreed to be implanted with the woman's embryo and carry the baby to term. The woman paid the defendant the $10,000 upfront. During the seventh month of the pregnancy, the defendant changed her mind and decided to keep the child herself. The defendant moved out of state and gave birth to the baby, which she refuses to turn over to the woman. The defendant is guilty of\nA. no crime.\nB. embezzlement.\nC. kidnapping.\nD. false pretenses.\nAnswer: " # noqa + }, + { + 'role': 'BOT', + 'prompt': 'A\n' + }, + { + 'section': 'ice', + 'pos': 'end' + }, + { + 'section': 'ice', + 'pos': 'begin' + }, + { + 'role': + 'HUMAN', + 'prompt': + "A rescuer was driving on an isolated portion of a country road. His headlights caught a figure lying at the side of the road. The rescuer stopped to investigate and found a victim, who was bleeding from head wounds and appeared to have been severely beaten. The rescuer then lifted the victim into his car and drove her to the hospital, a half-hour trip. When they arrived at the hospital, the rescuer carried the victim into the emergency room. He left her with a nurse and then returned home. Although the victim recovered from her injuries, she sued the hospital for malpractice, claiming that she was not promptly given medical attention. At trial, the nurse proposes to testify that when the victim was first brought to the hospital, she was unconscious. The victim's attorney objects and moves to strike the nurse's testimony. The trial judge should\nA. sustain the objection, because it goes to an ultimate issue in the case. \nB. sustain the objection, because the nurse is not qualified to render an expert opinion. \nC. overrule the objection, because it is a shorthand rendition of what she observed. \nD. overrule the objection, because there are independent grounds to show a present sense impression. \nAnswer: " # noqa + }, + { + 'role': 'BOT', + 'prompt': 'C\n' + }, + { + 'section': 'ice', + 'pos': 'end' + }, + { + 'section': 'ice', + 'pos': 'begin' + }, + { + 'role': + 'HUMAN', + 'prompt': + "A young woman who attended a rock concert at a nightclub was injured when the band opened its performance with illegal fireworks that ignited foam insulation in the club's ceiling and walls. The young woman sued the radio station that sponsored the performance. 
The radio station has moved for summary judgment, claiming that it owed no duty to audience members. The evidence has established the following facts: The station advertised its sponsorship on the radio and in print, distributed free tickets to the concert, and in print, distributed free tickets to the concert, staffed the event with the station's interns to assist with crowd control, and provided a station disc jockey to serve as master of ceremonies. The master of ceremonies had the authority to stop or delay the performance at any time on the basis of any safety concern. The station knew or should have known that the band routinely used unlicensed, illegal fireworks in its performances. Should the court grant the radio station's motion for summary judgment? \nA. No, because there is sufficient evidence of knowledge and control on the part of the station to impose on it a duty of care to audience members.\nB. No, because under respondeat superior, the radio station is vicariously liable for the negligent actions of the band.\nC. Yes, because it is the band and the nightclub owners who owed audience members a duty of care.\nD. Yes, because the conduct of the band in setting off illegal fireworks was criminal and setting off illegal fireworks was criminal and was a superseding cause as a matter of law.\nAnswer: " # noqa + }, + { + 'role': 'BOT', + 'prompt': 'A\n' + }, + { + 'section': 'ice', + 'pos': 'end' + }, + '\n', + '', + { + 'section': 'begin', + 'pos': 'end' + }, + { + 'section': 'round', + 'pos': 'begin' + }, + { + 'role': + 'HUMAN', + 'prompt': + 'A state statute provides: "Whenever a person knows or should know that he (or she) is being arrested by a police officer, it is the duty of such person to refrain from using force or any weapon in resisting arrest. " Violation of the statute is made punishable by fine and/or imprisonment. One morning, there was a bank robbery in the state. That afternoon, a police officer arrested a suspect who he believed was involved in the crime. However, the police officer and the suspect have given different accounts concerning what happened next. According to the police officer, after the suspect was apprehended, he resisted arrest and hit the police officer in the mouth with his fist. The police officer, who was momentarily stunned, pulled out his nightstick and struck the suspect over the head with it. On the other hand, the suspect claimed that after he was arrested, he cursed at the policeman, whereupon the police officer began hitting the suspect with his nightstick. To avoid being hit again, the suspect hit the police officer with his fist, knocking him down. The suspect was charged with assault. The suspect should be found\nA. not guilty, if the arrest was unlawful without probable cause and the jury believes the suspect\'s account.\nB. not guilty, if the arrest was lawful, provided that the jury believes the suspect\'s account.\nC. guilty, if the arrest was lawful, regardless which account the jury believes.\nD. guilty, if the arrest was unlawful, regardless which account the jury believes.\nAnswer: ' # noqa + }, + { + 'section': 'round', + 'pos': 'end' + } + ]), + 'Hello! How are you?' 
+]
+
+meta_templates = [
+    None,
+    dict(round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True)
+    ], ),
+    dict(
+        round=[
+            dict(role='HUMAN', api_role='HUMAN'),
+            dict(role='BOT', api_role='BOT', generate=True)
+        ],
+        reserved_roles=[
+            dict(role='SYSTEM', api_role='SYSTEM'),
+        ],
+    )
+]
+
+
+def test_model(model_cfg: ConfigDict):
+    """Run token counting and generation on the test prompts under each
+    candidate meta template."""
+    for meta_template in meta_templates:
+        print('Testing meta_template: ', meta_template)
+        model_cfg['meta_template'] = meta_template
+        model = build_model_from_cfg(model_cfg)
+        print('Prompt 0 length:',
+              model.get_token_len_from_template(test_prompts[0]))
+        print('Prompt 1 length:',
+              model.get_token_len_from_template(test_prompts[1]))
+        print('Prompt lengths: ',
+              model.get_token_len_from_template(test_prompts))
+        msgs = model.generate_from_template(test_prompts, max_out_len=100)
+        print('Prompt 0 response:', msgs[0])
+        print('Prompt 1 response:', msgs[1])
+        print('-' * 100)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Test if a given API model wrapper works properly')
+    parser.add_argument('config', help='Config file path')
+    parser.add_argument('-n', '--non-interactive', action='store_true')
+    args = parser.parse_args()
+    return args
+
+
+def parse_model_cfg(model_cfg: ConfigDict) -> Dict[str, ConfigDict]:
+    model2cfg = {}
+    for model in model_cfg:
+        model2cfg[model_abbr_from_cfg(model)] = model
+    return model2cfg
+
+
+def main():
+    args = parse_args()
+    cfg = Config.fromfile(args.config)
+    if 'models' not in cfg:
+        raise ValueError('No "models" specified in config file!')
+    model2cfg = parse_model_cfg(cfg.models)
+
+    if not args.non_interactive and len(model2cfg) > 1:
+        model = Menu([list(model2cfg.keys())],
+                     ['Please select a model:']).run()
+    else:
+        model = list(model2cfg.keys())[0]
+    model_cfg = model2cfg[model]
+    test_model(model_cfg)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/update_dataset_suffix.py b/tools/update_dataset_suffix.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcecf8ef253a5261565d3f78f0b90e31dc8e56fd
--- /dev/null
+++ b/tools/update_dataset_suffix.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+import argparse
+import glob
+import hashlib
+import json
+import os
+import re
+from multiprocessing import Pool
+from typing import List, Union
+
+from mmengine.config import Config, ConfigDict
+
+
+# from opencompass.utils import get_prompt_hash
+# copied from opencompass.utils.get_prompt_hash, for easy use in ci
+def get_prompt_hash(dataset_cfg: Union[ConfigDict, List[ConfigDict]]) -> str:
+    """Get the hash of the prompt configuration.
+
+    Args:
+        dataset_cfg (ConfigDict or list[ConfigDict]): The dataset
+            configuration.
+
+    Returns:
+        str: The hash of the prompt configuration.
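+
+    Example (a hypothetical custom-dataset config, for illustration only;
+    any cfg without an ``infer_cfg`` key takes the custom-dataset path):
+
+        >>> cfg = ConfigDict(dict(abbr='demo', path='data/demo', k=1))
+        >>> len(get_prompt_hash(cfg)) == 64  # sha256 hex digest
+        True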
+ """ + if isinstance(dataset_cfg, list): + if len(dataset_cfg) == 1: + dataset_cfg = dataset_cfg[0] + else: + hashes = ','.join([get_prompt_hash(cfg) for cfg in dataset_cfg]) + hash_object = hashlib.sha256(hashes.encode()) + return hash_object.hexdigest() + # for custom datasets + if 'infer_cfg' not in dataset_cfg: + dataset_cfg.pop('abbr', '') + dataset_cfg.pop('path', '') + d_json = json.dumps(dataset_cfg.to_dict(), sort_keys=True) + hash_object = hashlib.sha256(d_json.encode()) + return hash_object.hexdigest() + # for regular datasets + if 'reader_cfg' in dataset_cfg.infer_cfg: + # new config + reader_cfg = dict(type='DatasetReader', + input_columns=dataset_cfg.reader_cfg.input_columns, + output_column=dataset_cfg.reader_cfg.output_column) + dataset_cfg.infer_cfg.reader = reader_cfg + if 'train_split' in dataset_cfg.infer_cfg.reader_cfg: + dataset_cfg.infer_cfg.retriever[ + 'index_split'] = dataset_cfg.infer_cfg['reader_cfg'][ + 'train_split'] + if 'test_split' in dataset_cfg.infer_cfg.reader_cfg: + dataset_cfg.infer_cfg.retriever[ + 'test_split'] = dataset_cfg.infer_cfg.reader_cfg.test_split + for k, v in dataset_cfg.infer_cfg.items(): + dataset_cfg.infer_cfg[k]['type'] = v['type'].split('.')[-1] + # A compromise for the hash consistency + if 'fix_id_list' in dataset_cfg.infer_cfg.retriever: + fix_id_list = dataset_cfg.infer_cfg.retriever.pop('fix_id_list') + dataset_cfg.infer_cfg.inferencer['fix_id_list'] = fix_id_list + d_json = json.dumps(dataset_cfg.infer_cfg.to_dict(), sort_keys=True) + hash_object = hashlib.sha256(d_json.encode()) + return hash_object.hexdigest() + + +# Assuming get_hash is a function that computes the hash of a file +# from get_hash import get_hash +def get_hash(path): + cfg = Config.fromfile(path) + for k in cfg.keys(): + if k.endswith('_datasets'): + return get_prompt_hash(cfg[k])[:6] + print(f'Could not find *_datasets in {path}') + return None + + +def check_and_rename(filepath): + base_name = os.path.basename(filepath) + match = re.match(r'(.*)_(gen|ppl|ll|mixed)_(.*).py', base_name) + if match: + dataset, mode, old_hash = match.groups() + try: + new_hash = get_hash(filepath) + except Exception: + print(f'Failed to get hash for {filepath}') + raise ModuleNotFoundError + + if not new_hash: + return None, None + if old_hash != new_hash: + new_name = f'{dataset}_{mode}_{new_hash}.py' + new_file = os.path.join(os.path.dirname(filepath), new_name) + print(f'Rename {filepath} to {new_file}') + return filepath, new_file + return None, None + + +# def update_imports(data): +# python_file, name_pairs = data +# for filepath, new_file in name_pairs: +# old_name = os.path.basename(filepath)[:-3] +# new_name = os.path.basename(new_file)[:-3] +# if not os.path.exists(python_file): +# return +# with open(python_file, 'r') as file: +# filedata = file.read() +# # Replace the old name with new name +# new_data = filedata.replace(old_name, new_name) +# if filedata != new_data: +# with open(python_file, 'w') as file: +# file.write(new_data) +# # print(f"Updated imports in {python_file}") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('python_files', nargs='*') + # Could be opencompass/configs/datasets and configs/datasets + parser.add_argument('--root_folder', default='configs/datasets') + args = parser.parse_args() + + root_folder = args.root_folder + if args.python_files: + python_files = [ + i for i in args.python_files if i.startswith(root_folder) + ] + else: + python_files = glob.glob(f'{root_folder}/**/*.py', recursive=True) + + # Use 
multiprocessing to speed up the check and rename process + with Pool(16) as p: + name_pairs = p.map(check_and_rename, python_files) + name_pairs = [pair for pair in name_pairs if pair[0] is not None] + if not name_pairs: + return + with Pool(16) as p: + p.starmap(os.rename, name_pairs) + # root_folder = 'configs' + # python_files = glob.glob(f'{root_folder}/**/*.py', recursive=True) + # update_data = [(python_file, name_pairs) for python_file in python_files] + # with Pool(16) as p: + # p.map(update_imports, update_data) + + +if __name__ == '__main__': + main() diff --git a/tools/viz_multi_model.py b/tools/viz_multi_model.py new file mode 100644 index 0000000000000000000000000000000000000000..e1421bdfc8331aacf057dfabe884fa1925dc0c14 --- /dev/null +++ b/tools/viz_multi_model.py @@ -0,0 +1,50 @@ +from pathlib import Path +from typing import List + +import typer +from mmengine.config import Config +from typer import Option + +from opencompass.registry import build_from_cfg +from opencompass.summarizers.multi_model import MultiModelSummarizer + +app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False) + + +@app.command(help='Visualize the results of multiple models') +def main( + cfg_paths: List[Path] = Option( + ..., + help='The path to the config file of the task', + exists=True, + ), + work_dirs: List[Path] = Option( + ..., + help='The work dirs for the task(named by timestamp), ' + 'need to ensure the order is the same as cfg_paths.', + exists=True, + ), + group: str = Option(None, + help='If not None, show the accuracy in the group.'), +): + assert len(cfg_paths) == len(work_dirs) + cfgs = [Config.fromfile(it, format_python_code=False) for it in cfg_paths] + + multi_models_summarizer = None + for cfg, work_dir in zip(cfgs, work_dirs): + cfg['work_dir'] = work_dir + summarizer_cfg = cfg.get('summarizer', {}) + summarizer_cfg['type'] = MultiModelSummarizer + summarizer_cfg['config'] = cfg + summarizer = build_from_cfg(summarizer_cfg) + if multi_models_summarizer is None: + multi_models_summarizer = summarizer + else: + multi_models_summarizer.merge(summarizer) + multi_models_summarizer.summarize() + if group: + multi_models_summarizer.show_group(group) + + +if __name__ == '__main__': + app()
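+
+# Example invocation (paths are hypothetical; typer renders the list options
+# as repeatable flags, and cfg_paths/work_dirs must be given in matching
+# order):
+#   python tools/viz_multi_model.py \
+#       --cfg-paths configs/eval_model_a.py \
+#       --cfg-paths configs/eval_model_b.py \
+#       --work-dirs outputs/model_a/20230801_000000 \
+#       --work-dirs outputs/model_b/20230801_000000 \
+#       --group mmlu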