Spaces:
Sleeping
Sleeping
Synced repo using 'sync_with_huggingface' Github Action
Browse files- .gitattributes +1 -0
- Dockerfile +27 -0
- LICENSE +201 -0
- __pycache__/custom_prompt_template.cpython-311.pyc +0 -0
- __pycache__/custom_prompt_template.cpython-39.pyc +0 -0
- app.py +451 -0
- custom_prompt_template.py +43 -0
- data-downloader/download_eval_data.sh +68 -0
- data-downloader/download_instructions_data.sh +120 -0
- olive_farm.png +3 -0
- open_instruct/get_data_stats.py +121 -0
- open_instruct/reformat_data.py +551 -0
- requirements.txt +6 -0
- web-app.py +67 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
olive_farm.png filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.8-slim-buster
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
COPY ./requirements.txt /app/requirements.txt
|
| 6 |
+
# COPY ./packages.txt /app/packages.txt
|
| 7 |
+
|
| 8 |
+
# RUN apt-get update && xargs -r -a /app/packages.txt apt-get install -y && rm -rf /var/lib/apt/lists/*
|
| 9 |
+
RUN pip3 install --no-cache-dir -r /app/requirements.txt
|
| 10 |
+
|
| 11 |
+
# User
|
| 12 |
+
RUN useradd -m -u 1000 user
|
| 13 |
+
USER user
|
| 14 |
+
ENV HOME /home/user
|
| 15 |
+
ENV PATH $HOME/.local/bin:$PATH
|
| 16 |
+
|
| 17 |
+
WORKDIR $HOME
|
| 18 |
+
RUN mkdir app
|
| 19 |
+
WORKDIR $HOME/app
|
| 20 |
+
COPY . $HOME/app
|
| 21 |
+
|
| 22 |
+
EXPOSE 8501
|
| 23 |
+
CMD streamlit run app.py \
|
| 24 |
+
--server.headless true \
|
| 25 |
+
--server.enableCORS false \
|
| 26 |
+
--server.enableXsrfProtection false \
|
| 27 |
+
--server.fileWatcherType none
|
LICENSE
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Apache License
|
| 2 |
+
Version 2.0, January 2004
|
| 3 |
+
http://www.apache.org/licenses/
|
| 4 |
+
|
| 5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 6 |
+
|
| 7 |
+
1. Definitions.
|
| 8 |
+
|
| 9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
| 10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
| 11 |
+
|
| 12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
| 13 |
+
the copyright owner that is granting the License.
|
| 14 |
+
|
| 15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
| 16 |
+
other entities that control, are controlled by, or are under common
|
| 17 |
+
control with that entity. For the purposes of this definition,
|
| 18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
| 19 |
+
direction or management of such entity, whether by contract or
|
| 20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 22 |
+
|
| 23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
| 24 |
+
exercising permissions granted by this License.
|
| 25 |
+
|
| 26 |
+
"Source" form shall mean the preferred form for making modifications,
|
| 27 |
+
including but not limited to software source code, documentation
|
| 28 |
+
source, and configuration files.
|
| 29 |
+
|
| 30 |
+
"Object" form shall mean any form resulting from mechanical
|
| 31 |
+
transformation or translation of a Source form, including but
|
| 32 |
+
not limited to compiled object code, generated documentation,
|
| 33 |
+
and conversions to other media types.
|
| 34 |
+
|
| 35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
| 36 |
+
Object form, made available under the License, as indicated by a
|
| 37 |
+
copyright notice that is included in or attached to the work
|
| 38 |
+
(an example is provided in the Appendix below).
|
| 39 |
+
|
| 40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
| 41 |
+
form, that is based on (or derived from) the Work and for which the
|
| 42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
| 43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
| 44 |
+
of this License, Derivative Works shall not include works that remain
|
| 45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
| 46 |
+
the Work and Derivative Works thereof.
|
| 47 |
+
|
| 48 |
+
"Contribution" shall mean any work of authorship, including
|
| 49 |
+
the original version of the Work and any modifications or additions
|
| 50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
| 51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
| 53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
| 54 |
+
means any form of electronic, verbal, or written communication sent
|
| 55 |
+
to the Licensor or its representatives, including but not limited to
|
| 56 |
+
communication on electronic mailing lists, source code control systems,
|
| 57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
| 58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
| 59 |
+
excluding communication that is conspicuously marked or otherwise
|
| 60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
| 61 |
+
|
| 62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
| 64 |
+
subsequently incorporated within the Work.
|
| 65 |
+
|
| 66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
| 70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
| 71 |
+
Work and such Derivative Works in Source or Object form.
|
| 72 |
+
|
| 73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
| 74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 76 |
+
(except as stated in this section) patent license to make, have made,
|
| 77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 78 |
+
where such license applies only to those patent claims licensable
|
| 79 |
+
by such Contributor that are necessarily infringed by their
|
| 80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
| 81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
| 82 |
+
institute patent litigation against any entity (including a
|
| 83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 84 |
+
or a Contribution incorporated within the Work constitutes direct
|
| 85 |
+
or contributory patent infringement, then any patent licenses
|
| 86 |
+
granted to You under this License for that Work shall terminate
|
| 87 |
+
as of the date such litigation is filed.
|
| 88 |
+
|
| 89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
| 90 |
+
Work or Derivative Works thereof in any medium, with or without
|
| 91 |
+
modifications, and in Source or Object form, provided that You
|
| 92 |
+
meet the following conditions:
|
| 93 |
+
|
| 94 |
+
(a) You must give any other recipients of the Work or
|
| 95 |
+
Derivative Works a copy of this License; and
|
| 96 |
+
|
| 97 |
+
(b) You must cause any modified files to carry prominent notices
|
| 98 |
+
stating that You changed the files; and
|
| 99 |
+
|
| 100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
| 101 |
+
that You distribute, all copyright, patent, trademark, and
|
| 102 |
+
attribution notices from the Source form of the Work,
|
| 103 |
+
excluding those notices that do not pertain to any part of
|
| 104 |
+
the Derivative Works; and
|
| 105 |
+
|
| 106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
| 107 |
+
distribution, then any Derivative Works that You distribute must
|
| 108 |
+
include a readable copy of the attribution notices contained
|
| 109 |
+
within such NOTICE file, excluding those notices that do not
|
| 110 |
+
pertain to any part of the Derivative Works, in at least one
|
| 111 |
+
of the following places: within a NOTICE text file distributed
|
| 112 |
+
as part of the Derivative Works; within the Source form or
|
| 113 |
+
documentation, if provided along with the Derivative Works; or,
|
| 114 |
+
within a display generated by the Derivative Works, if and
|
| 115 |
+
wherever such third-party notices normally appear. The contents
|
| 116 |
+
of the NOTICE file are for informational purposes only and
|
| 117 |
+
do not modify the License. You may add Your own attribution
|
| 118 |
+
notices within Derivative Works that You distribute, alongside
|
| 119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
| 120 |
+
that such additional attribution notices cannot be construed
|
| 121 |
+
as modifying the License.
|
| 122 |
+
|
| 123 |
+
You may add Your own copyright statement to Your modifications and
|
| 124 |
+
may provide additional or different license terms and conditions
|
| 125 |
+
for use, reproduction, or distribution of Your modifications, or
|
| 126 |
+
for any such Derivative Works as a whole, provided Your use,
|
| 127 |
+
reproduction, and distribution of the Work otherwise complies with
|
| 128 |
+
the conditions stated in this License.
|
| 129 |
+
|
| 130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
| 132 |
+
by You to the Licensor shall be under the terms and conditions of
|
| 133 |
+
this License, without any additional terms or conditions.
|
| 134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
| 135 |
+
the terms of any separate license agreement you may have executed
|
| 136 |
+
with Licensor regarding such Contributions.
|
| 137 |
+
|
| 138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
| 139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 140 |
+
except as required for reasonable and customary use in describing the
|
| 141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 142 |
+
|
| 143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 144 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 147 |
+
implied, including, without limitation, any warranties or conditions
|
| 148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 150 |
+
appropriateness of using or redistributing the Work and assume any
|
| 151 |
+
risks associated with Your exercise of permissions under this License.
|
| 152 |
+
|
| 153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 154 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 155 |
+
unless required by applicable law (such as deliberate and grossly
|
| 156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 157 |
+
liable to You for damages, including any direct, indirect, special,
|
| 158 |
+
incidental, or consequential damages of any character arising as a
|
| 159 |
+
result of this License or out of the use or inability to use the
|
| 160 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 161 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 162 |
+
other commercial damages or losses), even if such Contributor
|
| 163 |
+
has been advised of the possibility of such damages.
|
| 164 |
+
|
| 165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
| 166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 168 |
+
or other liability obligations and/or rights consistent with this
|
| 169 |
+
License. However, in accepting such obligations, You may act only
|
| 170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 171 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 172 |
+
defend, and hold each Contributor harmless for any liability
|
| 173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 174 |
+
of your accepting any such warranty or additional liability.
|
| 175 |
+
|
| 176 |
+
END OF TERMS AND CONDITIONS
|
| 177 |
+
|
| 178 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 179 |
+
|
| 180 |
+
To apply the Apache License to your work, attach the following
|
| 181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 182 |
+
replaced with your own identifying information. (Don't include
|
| 183 |
+
the brackets!) The text should be enclosed in the appropriate
|
| 184 |
+
comment syntax for the file format. We also recommend that a
|
| 185 |
+
file or class name and description of purpose be included on the
|
| 186 |
+
same "printed page" as the copyright notice for easier
|
| 187 |
+
identification within third-party archives.
|
| 188 |
+
|
| 189 |
+
Copyright [yyyy] [name of copyright owner]
|
| 190 |
+
|
| 191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 192 |
+
you may not use this file except in compliance with the License.
|
| 193 |
+
You may obtain a copy of the License at
|
| 194 |
+
|
| 195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 196 |
+
|
| 197 |
+
Unless required by applicable law or agreed to in writing, software
|
| 198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 200 |
+
See the License for the specific language governing permissions and
|
| 201 |
+
limitations under the License.
|
__pycache__/custom_prompt_template.cpython-311.pyc
ADDED
|
Binary file (2.41 kB). View file
|
|
|
__pycache__/custom_prompt_template.cpython-39.pyc
ADDED
|
Binary file (1.33 kB). View file
|
|
|
app.py
ADDED
|
@@ -0,0 +1,451 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import requests
|
| 3 |
+
import justext
|
| 4 |
+
import pdfplumber
|
| 5 |
+
import docx2txt
|
| 6 |
+
import json
|
| 7 |
+
import ast
|
| 8 |
+
import os
|
| 9 |
+
import re
|
| 10 |
+
import openai
|
| 11 |
+
import json
|
| 12 |
+
|
| 13 |
+
from custom_prompt_template import InstructionGenerationTemplate, AnswerGenerationTemplate
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
st.set_page_config(page_title="LLM instruction Generator")
|
| 17 |
+
|
| 18 |
+
st.sidebar.success("Select a page above")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# function for the odia stoplists justext
|
| 22 |
+
def odia_stoplist():
|
| 23 |
+
odia_stopwords = [
|
| 24 |
+
"ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
|
| 25 |
+
"ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
|
| 26 |
+
"ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡିକ", "ସେମାନଙ୍କର",
|
| 27 |
+
"ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
|
| 28 |
+
"ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
|
| 29 |
+
"ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |",
|
| 30 |
+
"କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
|
| 31 |
+
"ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
|
| 32 |
+
"ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
|
| 33 |
+
"ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ",
|
| 34 |
+
"ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
|
| 35 |
+
"ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
|
| 36 |
+
"ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ",
|
| 37 |
+
"କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ"
|
| 38 |
+
]
|
| 39 |
+
return frozenset(odia_stopwords)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# function to extract data from url using justext
|
| 43 |
+
def extract_data_from_url(url, language):
|
| 44 |
+
try:
|
| 45 |
+
response = requests.get(url)
|
| 46 |
+
|
| 47 |
+
if response.status_code == 200:
|
| 48 |
+
print("inside the response")
|
| 49 |
+
response.raise_for_status()
|
| 50 |
+
page = response.content
|
| 51 |
+
para = ""
|
| 52 |
+
if language == "English":
|
| 53 |
+
paragraphs = justext.justext(page, justext.get_stoplist("English"))
|
| 54 |
+
elif language == "Hindi":
|
| 55 |
+
paragraphs = justext.justext(page, justext.get_stoplist("Hindi"), 70, 140, 0.0, 0.02, 0.5, 150, False)
|
| 56 |
+
elif language == "Odia":
|
| 57 |
+
paragraphs = justext.justext(
|
| 58 |
+
page, odia_stoplist(), 70, 140, 0.0, 0.02, 0.5, 150, False
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
for paragraph in paragraphs:
|
| 62 |
+
if not paragraph.is_boilerplate:
|
| 63 |
+
para = para + "\n" + paragraph.text
|
| 64 |
+
# returning the extracted data i.e para as string
|
| 65 |
+
if para == "":
|
| 66 |
+
st.error("Unable to extract data from the URL")
|
| 67 |
+
return None
|
| 68 |
+
else:
|
| 69 |
+
return para
|
| 70 |
+
else:
|
| 71 |
+
st.error("Request failed ")
|
| 72 |
+
return None
|
| 73 |
+
except Exception as err:
|
| 74 |
+
st.error(err)
|
| 75 |
+
return None
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# function to extract data from documents
|
| 81 |
+
def extract_data_from_documents(documents):
|
| 82 |
+
data = ""
|
| 83 |
+
if documents is not None:
|
| 84 |
+
for document in documents:
|
| 85 |
+
document_details = {
|
| 86 |
+
"filename": document.name,
|
| 87 |
+
"filetype": document.type,
|
| 88 |
+
"filesize": document.size,
|
| 89 |
+
}
|
| 90 |
+
st.write(document_details)
|
| 91 |
+
|
| 92 |
+
# Extract content from the txt file
|
| 93 |
+
if document.type == "text/plain":
|
| 94 |
+
# Read as bytes
|
| 95 |
+
data += str(document.read(), "utf-8")
|
| 96 |
+
|
| 97 |
+
# Extract content from the pdf file
|
| 98 |
+
elif document.type == "application/pdf":
|
| 99 |
+
# using pdfplumber
|
| 100 |
+
try:
|
| 101 |
+
with pdfplumber.open(document) as pdf:
|
| 102 |
+
all_text = ""
|
| 103 |
+
for page in pdf.pages:
|
| 104 |
+
text = page.extract_text()
|
| 105 |
+
all_text += text + "\n"
|
| 106 |
+
data += all_text
|
| 107 |
+
except requests.exceptions.RequestException as e:
|
| 108 |
+
st.write("None")
|
| 109 |
+
|
| 110 |
+
# Extract content from the docx file
|
| 111 |
+
elif (
|
| 112 |
+
document.type
|
| 113 |
+
== "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
| 114 |
+
):
|
| 115 |
+
data += docx2txt.process(document)
|
| 116 |
+
|
| 117 |
+
# return extract data
|
| 118 |
+
return data
|
| 119 |
+
else:
|
| 120 |
+
st.error("Error: An error occurred while fetching content.")
|
| 121 |
+
# return extract status, and the data extracted
|
| 122 |
+
return None
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# function for the keyboard
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
# Check the inputs for language, promptType
|
| 130 |
+
def valid_drop_down(language, promptType, noOfQuestions, instructionFormat):
|
| 131 |
+
langFlag = False
|
| 132 |
+
promptFlag = False
|
| 133 |
+
noOfQuestionFlag = False
|
| 134 |
+
instructionFormatFlag = False
|
| 135 |
+
|
| 136 |
+
if language:
|
| 137 |
+
langFlag = True
|
| 138 |
+
if promptType:
|
| 139 |
+
promptFlag = True
|
| 140 |
+
if noOfQuestions:
|
| 141 |
+
noOfQuestionFlag = True
|
| 142 |
+
if instructionFormat:
|
| 143 |
+
instructionFormatFlag = True
|
| 144 |
+
# checking for the compalsory inputs and return true only if all are set
|
| 145 |
+
return langFlag & promptFlag & noOfQuestionFlag & instructionFormatFlag
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def main():
|
| 149 |
+
# setting up the initial session_states
|
| 150 |
+
if "extract_button" not in st.session_state:
|
| 151 |
+
st.session_state.extract_button = False
|
| 152 |
+
if "submit" not in st.session_state:
|
| 153 |
+
st.session_state.submit = False
|
| 154 |
+
if "generated" not in st.session_state:
|
| 155 |
+
st.session_state.generated = False
|
| 156 |
+
if "selected" not in st.session_state:
|
| 157 |
+
st.session_state.selected = False
|
| 158 |
+
if "answered" not in st.session_state:
|
| 159 |
+
st.session_state.answered = False
|
| 160 |
+
|
| 161 |
+
st.subheader("LLM Instructions")
|
| 162 |
+
|
| 163 |
+
# form to get the inputs
|
| 164 |
+
with st.form(key="form1"):
|
| 165 |
+
st.write("#")
|
| 166 |
+
|
| 167 |
+
# dropdown for language
|
| 168 |
+
language = st.selectbox("Select a language", ("", "English", "Hindi", "Odia"))
|
| 169 |
+
|
| 170 |
+
# dropdown for prompt type
|
| 171 |
+
promptType = st.selectbox(
|
| 172 |
+
"Select the Prompt type", ("", "Input text", "Url", "Document")
|
| 173 |
+
)
|
| 174 |
+
# inputs for number
|
| 175 |
+
noOfQuestions = st.number_input(
|
| 176 |
+
"Number of questions to generate:", min_value=1, max_value=20, value=10
|
| 177 |
+
)
|
| 178 |
+
|
| 179 |
+
# dropdown for language
|
| 180 |
+
instructionFormat = st.selectbox(
|
| 181 |
+
"Format of instruction:", ("Imperative sentence", "Question")
|
| 182 |
+
)
|
| 183 |
+
|
| 184 |
+
# input text for openAiKey
|
| 185 |
+
openAiKey = st.text_input(label="Input the openai key")
|
| 186 |
+
if "openAiKey" in st.session_state:
|
| 187 |
+
st.session_state["openAiKey"] = openAiKey
|
| 188 |
+
else:
|
| 189 |
+
st.session_state["openAiKey"] = openAiKey
|
| 190 |
+
|
| 191 |
+
st.write("##")
|
| 192 |
+
|
| 193 |
+
# form submit button and setting up the session_state
|
| 194 |
+
if st.form_submit_button():
|
| 195 |
+
st.session_state.submit = True
|
| 196 |
+
|
| 197 |
+
if st.session_state.submit:
|
| 198 |
+
# extends the prompt form to extract the data
|
| 199 |
+
with st.expander(label="prompt"):
|
| 200 |
+
with st.form(key="form2"):
|
| 201 |
+
# calling the function inside if to check valid drop down inputs
|
| 202 |
+
if valid_drop_down(
|
| 203 |
+
language, promptType, noOfQuestions, instructionFormat
|
| 204 |
+
):
|
| 205 |
+
if promptType == "Input text":
|
| 206 |
+
inputText = st.text_area(
|
| 207 |
+
label="For Instructions",
|
| 208 |
+
placeholder="Please enter your text here",
|
| 209 |
+
)
|
| 210 |
+
|
| 211 |
+
elif promptType == "Url":
|
| 212 |
+
url = st.text_input(
|
| 213 |
+
label="For URL", placeholder="Please enter your text here"
|
| 214 |
+
)
|
| 215 |
+
elif promptType == "Document":
|
| 216 |
+
documents = st.file_uploader(
|
| 217 |
+
label="For Documents ( pdf / txt / docx )",
|
| 218 |
+
type=["pdf", "txt", "docx"],
|
| 219 |
+
accept_multiple_files=True,
|
| 220 |
+
)
|
| 221 |
+
|
| 222 |
+
# if addInfoCheckbox:
|
| 223 |
+
# additionalInfo = st.text_input(
|
| 224 |
+
# label="Additional Instructions",
|
| 225 |
+
# placeholder="Please enter your text here",
|
| 226 |
+
# )
|
| 227 |
+
|
| 228 |
+
if st.form_submit_button():
|
| 229 |
+
st.session_state.extract_button = True
|
| 230 |
+
# st.experimental_rerun()
|
| 231 |
+
|
| 232 |
+
# extracting data
|
| 233 |
+
if st.session_state.extract_button:
|
| 234 |
+
# extracting data
|
| 235 |
+
|
| 236 |
+
if promptType == "Input text":
|
| 237 |
+
extractedData = inputText
|
| 238 |
+
|
| 239 |
+
elif promptType == "Url":
|
| 240 |
+
extractedURLData = extract_data_from_url(url, language)
|
| 241 |
+
if extractedURLData is not None:
|
| 242 |
+
extractedData = extractedURLData
|
| 243 |
+
st.text_area("Extracted Text:", value=extractedData, height=200)
|
| 244 |
+
else:
|
| 245 |
+
extractedData = False
|
| 246 |
+
elif promptType == "Document":
|
| 247 |
+
if not documents:
|
| 248 |
+
documents = None
|
| 249 |
+
else:
|
| 250 |
+
for doc in documents:
|
| 251 |
+
if doc.name.split(".")[-1].lower() not in ["pdf", "txt", "docx"]:
|
| 252 |
+
# if documents is not the relevant type
|
| 253 |
+
st.error("Unsupported file: " + doc.name)
|
| 254 |
+
|
| 255 |
+
extractedDocumentData = extract_data_from_documents(documents)
|
| 256 |
+
extractedData = extractedDocumentData
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
# if the values are extracted running the custom prompt by creating an instance
|
| 260 |
+
if extractedData:
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
# ----------------------------- RUNNING THE PROMPT -----------------------------
|
| 264 |
+
if "extractedData" not in st.session_state:
|
| 265 |
+
st.session_state["extractedData"] = extractedData
|
| 266 |
+
else:
|
| 267 |
+
st.session_state["extractedData"] = extractedData
|
| 268 |
+
|
| 269 |
+
if "Initial" not in st.session_state:
|
| 270 |
+
st.session_state.Initial=True
|
| 271 |
+
|
| 272 |
+
if st.session_state.Initial == True:
|
| 273 |
+
|
| 274 |
+
# running the prompt form here
|
| 275 |
+
|
| 276 |
+
openai.api_key = st.session_state["openAiKey"]
|
| 277 |
+
my_prompt_template = InstructionGenerationTemplate()
|
| 278 |
+
|
| 279 |
+
# providing the rules for the instructions to be generated
|
| 280 |
+
additional_rules = """
|
| 281 |
+
- You do not need to provide a response to the generated examples.
|
| 282 |
+
- You must return the response in the specified language.
|
| 283 |
+
- Each generated instruction can be either an imperative sentence or a question.
|
| 284 |
+
"""
|
| 285 |
+
|
| 286 |
+
if st.button("Generate Instructions"):
|
| 287 |
+
prompt = my_prompt_template.format(
|
| 288 |
+
num_questions=noOfQuestions,
|
| 289 |
+
context=extractedData,
|
| 290 |
+
instruction_format=instructionFormat,
|
| 291 |
+
lang=language,
|
| 292 |
+
additional_rules=additional_rules
|
| 293 |
+
)
|
| 294 |
+
response = openai.ChatCompletion.create(
|
| 295 |
+
model="gpt-3.5-turbo",
|
| 296 |
+
messages=[
|
| 297 |
+
{"role": "system", "content": prompt},
|
| 298 |
+
])
|
| 299 |
+
# if st.button("Generate Instructions"):
|
| 300 |
+
print("Generate button")
|
| 301 |
+
print("Checkpoint 1!")
|
| 302 |
+
|
| 303 |
+
if "result" not in st.session_state:
|
| 304 |
+
content = response.choices[0].message.content
|
| 305 |
+
# content = "\n1. helloworld1.\n2. helloworld2"
|
| 306 |
+
responses_list = content.split('\n')
|
| 307 |
+
responses_list = [re.sub(r'^\s*\d+\.\s*', '', resp) for resp in responses_list if resp]
|
| 308 |
+
st.session_state["result"]=responses_list
|
| 309 |
+
st.session_state.generated = True
|
| 310 |
+
st.session_state.Initial = False
|
| 311 |
+
if st.session_state.generated:
|
| 312 |
+
# displaying the generated instructions
|
| 313 |
+
st.write("Generated Insuctions")
|
| 314 |
+
result = st.session_state["result"]
|
| 315 |
+
# print(type(result))
|
| 316 |
+
# print(result)
|
| 317 |
+
result_dict = {i+1: value for i,value in enumerate(result)}
|
| 318 |
+
selected_items = [f" {value} " for key, value in result_dict.items() if st.checkbox(f"Q{key} : {value}")]
|
| 319 |
+
# print(type(result_dict))
|
| 320 |
+
# print(result_dict)
|
| 321 |
+
print("Checked point 2!")
|
| 322 |
+
# Display the selected items as a list
|
| 323 |
+
if selected_items:
|
| 324 |
+
st.write("Selected Items:")
|
| 325 |
+
st.write(selected_items)
|
| 326 |
+
if "selected_items" not in st.session_state:
|
| 327 |
+
st.session_state["selected_items"] = selected_items
|
| 328 |
+
st.session_state["selected_items"] = selected_items
|
| 329 |
+
st.session_state.selected = True
|
| 330 |
+
else:
|
| 331 |
+
st.write("No items selected.")
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
# ----------------------------- RUNNING THE PROMPT FOR ANSWER GENERATION -----------------------------
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
if st.session_state.selected:
|
| 340 |
+
|
| 341 |
+
if "Initial2" not in st.session_state:
|
| 342 |
+
st.session_state.Initial2=True
|
| 343 |
+
|
| 344 |
+
if st.session_state.Initial2:
|
| 345 |
+
# running the prompt form here
|
| 346 |
+
openai.api_key = st.session_state["openAiKey"]
|
| 347 |
+
my_prompt_template2 = AnswerGenerationTemplate()
|
| 348 |
+
|
| 349 |
+
# providing the rules for the answers to be generated
|
| 350 |
+
additional_rules = """
|
| 351 |
+
Enumerate the answers and dont provide any additional tags.
|
| 352 |
+
"""
|
| 353 |
+
|
| 354 |
+
question = st.session_state["selected_items"]
|
| 355 |
+
if st.button("Generate Answers"):
|
| 356 |
+
prompt = my_prompt_template2.format(
|
| 357 |
+
questions=question,
|
| 358 |
+
additional_rules = additional_rules
|
| 359 |
+
)
|
| 360 |
+
response = openai.ChatCompletion.create(
|
| 361 |
+
model="gpt-3.5-turbo",
|
| 362 |
+
messages=[
|
| 363 |
+
{"role": "system", "content": prompt},
|
| 364 |
+
])
|
| 365 |
+
|
| 366 |
+
# if st.button("Generate Answers"):
|
| 367 |
+
# print("\n\n\n\nInside Answersss:\n\n\n\n")
|
| 368 |
+
# print(st.session_state["selected_items"])
|
| 369 |
+
|
| 370 |
+
# print("Generate button")
|
| 371 |
+
# print("Checkpoint 3!")
|
| 372 |
+
|
| 373 |
+
if "answers" not in st.session_state:
|
| 374 |
+
content = response.choices[0].message.content
|
| 375 |
+
# content = "\n1. Answer1.\n2. Answer2"
|
| 376 |
+
print("\n\n\n\n\nAnswerss before regex\n\n\n\n")
|
| 377 |
+
print(content)
|
| 378 |
+
# print("Answer Type:" + str(type(content)))
|
| 379 |
+
responses_list = content.split('\n')
|
| 380 |
+
# print("\n\n\n\n\nAnswerss before regex after splitting\n\n\n\n")
|
| 381 |
+
# print(responses_list)
|
| 382 |
+
# print("Answer Type:" + str(type(responses_list)))
|
| 383 |
+
|
| 384 |
+
responses_list = [re.sub(r'^\s*\d+\.\s*', '', resp) for resp in responses_list if resp]
|
| 385 |
+
st.session_state["answers"]=responses_list
|
| 386 |
+
st.session_state.answered = True
|
| 387 |
+
st.session_state.Initial2 = False
|
| 388 |
+
|
| 389 |
+
if st.session_state.answered:
|
| 390 |
+
# displaying the generated Answers
|
| 391 |
+
|
| 392 |
+
questions = st.session_state["selected_items"]
|
| 393 |
+
answers = st.session_state["answers"]
|
| 394 |
+
# print("\n\n\n\n\nAnswerss after regex\n\n\n\n")
|
| 395 |
+
# print(answers)
|
| 396 |
+
# print("Answer Type:" + str(type(answers)))
|
| 397 |
+
answers_dict = {i+1: value for i,value in enumerate(answers)}
|
| 398 |
+
# print(type(answers_dict))
|
| 399 |
+
# print(answers_dict)
|
| 400 |
+
# print("Checked point 4!")
|
| 401 |
+
# st.write("answers")
|
| 402 |
+
st.write(answers_dict)
|
| 403 |
+
|
| 404 |
+
# Create a list to hold the JSON-like data
|
| 405 |
+
st.write("Generated Questions and Answers")
|
| 406 |
+
# Create a list of dictionaries
|
| 407 |
+
jsonl_data = [{"Question": question, "Answer": answers_dict.get(i, 'No answer found')} for i, question in enumerate(questions, start=1)]
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
st.write(jsonl_data)
|
| 411 |
+
jsonl_string = '\n'.join(json.dumps(item, ensure_ascii=False) for item in jsonl_data)
|
| 412 |
+
|
| 413 |
+
# Display the JSONL data
|
| 414 |
+
print(jsonl_string)
|
| 415 |
+
|
| 416 |
+
if st.download_button(label="Save as jsonl", data=jsonl_string, mime="application/json"):
|
| 417 |
+
st.success("Successfully saved")
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
if st.button("Clear"):
|
| 423 |
+
st.session_state.extract_button = False
|
| 424 |
+
st.session_state.submit = False
|
| 425 |
+
st.session_state.generated = False
|
| 426 |
+
st.session_state.selected = False
|
| 427 |
+
st.session_state.answered = False
|
| 428 |
+
|
| 429 |
+
|
| 430 |
+
|
| 431 |
+
if "Initial" in st.session_state:
|
| 432 |
+
st.session_state.Initial = True
|
| 433 |
+
if "Initial2" in st.session_state:
|
| 434 |
+
st.session_state.Initial2 = True
|
| 435 |
+
|
| 436 |
+
|
| 437 |
+
if "openAiKey" in st.session_state:
|
| 438 |
+
del st.session_state["openAiKey"]
|
| 439 |
+
if "extractedData" in st.session_state:
|
| 440 |
+
del st.session_state["extractedData"]
|
| 441 |
+
if "result" in st.session_state:
|
| 442 |
+
del st.session_state["result"]
|
| 443 |
+
if "selected_items" in st.session_state:
|
| 444 |
+
del st.session_state["selected_items"]
|
| 445 |
+
if "answered" in st.session_state:
|
| 446 |
+
del st.session_state["answers"]
|
| 447 |
+
st.experimental_rerun()
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
if __name__ == "__main__":
|
| 451 |
+
main()
|
custom_prompt_template.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
import langchain
|
| 3 |
+
class InstructionGenerationTemplate(langchain.prompts.PromptTemplate):
    """A custom prompt template for generating instructions.

    Subclasses langchain's PromptTemplate, but the `format` override below
    applies plain `str.format` to the class-level `template` string, so
    langchain's own rendering/validation machinery is bypassed.
    """

    # Placeholders the caller must supply as keyword arguments to `format`.
    input_variables: List[str] = ["num_questions", "context", "instruction_format", "lang", "additional_rules"]

    # NOTE(review): `template` and `template_format` are unannotated class
    # attributes on a pydantic-based model; confirm langchain/pydantic treats
    # them as intended (they are not validated as pydantic fields here).
    # The literal whitespace inside `template` is sent to the model verbatim.
    template = """
    You are a highly intelligent language model trained to assist with a variety of language tasks. Your task here is to generate {num_questions} diverse questions or instructions based on the context provided below:

    Context:
    {context}

    Please follow these rules:
    {additional_rules}

    Please generate the instructions in the {instruction_format} format and in {lang} language. Remember to adhere to the rules mentioned above.
    """

    template_format = "f-string"

    def format(self, **kwargs):
        """Format the prompt by substituting **kwargs into `template`.

        NOTE(review): there is no check that every name in `input_variables`
        was supplied; a missing key raises KeyError from str.format.
        """
        return self.template.format(**kwargs)
|
| 24 |
+
|
| 25 |
+
class AnswerGenerationTemplate(langchain.prompts.PromptTemplate):
    """A custom prompt template for generating answers to questions.

    Like InstructionGenerationTemplate, this bypasses langchain's rendering:
    `format` applies plain `str.format` to the class-level `template`.
    """

    # Placeholders the caller must supply as keyword arguments to `format`.
    input_variables: List[str] = ["questions", "additional_rules"]

    # NOTE(review): unannotated class attribute on a pydantic-based model;
    # the literal whitespace inside `template` is sent to the model verbatim.
    template = """
    You are a highly intelligent language model tasked with providing answers to the following questions :

    Questions:
    {questions}

    Please follow these rules:
    {additional_rules}
    """

    template_format = "f-string"

    def format(self, **kwargs):
        """Format the prompt by substituting **kwargs into `template`.

        NOTE(review): a missing expected key raises KeyError from str.format.
        """
        return self.template.format(**kwargs)
|
data-downloader/download_eval_data.sh
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Download and stage all evaluation datasets under data/eval/.
#
# Fix: the original script had no error handling, so a failed download or
# extraction silently produced a partial data/eval directory. Fail fast
# instead: abort on any command error, unset variable, or pipeline failure.
set -euo pipefail

mkdir -p data/downloads
mkdir -p data/eval

# MMLU dataset
wget -O data/downloads/mmlu_data.tar https://people.eecs.berkeley.edu/~hendrycks/data.tar
mkdir -p data/downloads/mmlu_data
tar -xvf data/downloads/mmlu_data.tar -C data/downloads/mmlu_data
mv data/downloads/mmlu_data/data data/eval/mmlu && rm -r data/downloads/mmlu_data data/downloads/mmlu_data.tar


# Big-Bench-Hard dataset
wget -O data/downloads/bbh_data.zip https://github.com/suzgunmirac/BIG-Bench-Hard/archive/refs/heads/main.zip
mkdir -p data/downloads/bbh
unzip data/downloads/bbh_data.zip -d data/downloads/bbh
mv data/downloads/bbh/BIG-Bench-Hard-main/ data/eval/bbh && rm -r data/downloads/bbh data/downloads/bbh_data.zip


# Super-NaturalInstructions dataset
wget -O data/downloads/superni_data.zip https://github.com/allenai/natural-instructions/archive/refs/heads/master.zip
mkdir -p data/downloads/superni
unzip data/downloads/superni_data.zip -d data/downloads/superni
mv data/downloads/superni/natural-instructions-master/ data/eval/superni && rm -r data/downloads/superni data/downloads/superni_data.zip


# TyDiQA-GoldP dataset
mkdir -p data/eval/tydiqa
wget -P data/eval/tydiqa/ https://storage.googleapis.com/tydiqa/v1.1/tydiqa-goldp-v1.1-dev.json
wget -P data/eval/tydiqa/ https://storage.googleapis.com/tydiqa/v1.1/tydiqa-goldp-v1.1-train.json


# XOR-QA dataset
wget -P data/eval/xorqa/ https://raw.githubusercontent.com/mia-workshop/MIA-Shared-Task-2022/main/data/eval/mia_2022_dev_xorqa.jsonl
wget -P data/eval/xorqa/ https://github.com/mia-workshop/MIA-Shared-Task-2022/raw/main/data/train/mia_2022_train_data.jsonl.zip
unzip data/eval/xorqa/mia_2022_train_data.jsonl.zip -d data/eval/xorqa/ && rm data/eval/xorqa/mia_2022_train_data.jsonl.zip


# GSM dataset
wget -P data/eval/gsm/ https://github.com/openai/grade-school-math/raw/master/grade_school_math/data/test.jsonl


# Multilingual GSM dataset
wget -O data/downloads/url-nlp.zip https://github.com/google-research/url-nlp/archive/refs/heads/main.zip
mkdir -p data/downloads/url-nlp
unzip data/downloads/url-nlp.zip -d data/downloads/url-nlp
mv data/downloads/url-nlp/url-nlp-main/mgsm data/eval/mgsm && rm -r data/downloads/url-nlp data/downloads/url-nlp.zip


# Codex HumanEval
wget -P data/eval/codex_humaneval https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz


# TruthfulQA
wget -P data/eval/truthfulqa https://github.com/sylinrl/TruthfulQA/raw/main/TruthfulQA.csv


# Self-instruct eval, Vicuna eval, and Koala eval for creative instructions/tasks
mkdir -p data/eval/creative_tasks
wget -O data/eval/creative_tasks/self_instruct_test.jsonl https://github.com/yizhongw/self-instruct/raw/main/human_eval/user_oriented_instructions.jsonl
wget -O data/eval/creative_tasks/vicuna_test.jsonl https://github.com/lm-sys/FastChat/raw/main/fastchat/eval/table/question.jsonl
wget -O data/eval/creative_tasks/koala_test.jsonl https://github.com/arnav-gudibande/koala-test-set/raw/main/koala_test_set.jsonl


# Toxigen data: one hate-prompt file per targeted minority group
mkdir -p data/eval/toxigen
for minority_group in asian black chinese jewish latino lgbtq mental_disability mexican middle_east muslim native_american physical_disability trans women
do
    wget -O data/eval/toxigen/hate_${minority_group}.txt https://raw.githubusercontent.com/microsoft/TOXIGEN/main/prompts/hate_${minority_group}_1k.txt
done
|
data-downloader/download_instructions_data.sh
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Download and stage all instruction-tuning datasets under data/raw_train/,
# then reformat them and build the Tulu mixtures under data/processed/.
#
# Fixes vs. original:
#  - strict mode (set -euo pipefail) so a failed download no longer silently
#    produces a partial/corrupt data directory;
#  - the reformat step called open_instruct/reformat_datasets.py, but the
#    script shipped in this repo is open_instruct/reformat_data.py.
set -euo pipefail

# check if there is $HF_TOKEN in the environment variables
# (${HF_TOKEN:-} keeps this check working under `set -u` when unset)
if [ -z "${HF_TOKEN:-}" ]
then
    echo "Warning: HuggingFace dataset LIMA requires permissive access."
    echo "Warning: Please request the access at https://huggingface.co/datasets/GAIR/lima and set the HF_TOKEN environment variable before running this script."
    exit 1
fi

echo "Downloading Super-NaturalInstructions dataset..."
wget -P data/raw_train/super_ni/ https://github.com/allenai/natural-instructions/archive/refs/heads/master.zip
unzip data/raw_train/super_ni/master.zip -d data/raw_train/super_ni/ && rm data/raw_train/super_ni/master.zip
mv data/raw_train/super_ni/natural-instructions-master/* data/raw_train/super_ni/ && rm -r data/raw_train/super_ni/natural-instructions-master


echo "Downloading the flan_v2 chain-of-thought submix..."
wget -P data/raw_train/cot/ https://beaker.org/api/v3/datasets/01GXZ52K2Q932H6KZY499A7FE8/files/cot_zsopt.jsonl
wget -P data/raw_train/cot/ https://beaker.org/api/v3/datasets/01GXZ51ZV283RAZW7J3ECM4S58/files/cot_fsopt.jsonl


echo "Downloading the flan_v2 collection, here we subsampled only 100K instances..."
wget -P data/raw_train/flan_v2/ https://beaker.org/api/v3/datasets/01GZTTS2EJFPA83PXS4FQCS1SA/files/flan_v2_resampled_100k.jsonl


echo "Downloading self-instruct data..."
wget -P data/raw_train/self_instruct/ https://raw.githubusercontent.com/yizhongw/self-instruct/main/data/gpt3_generations/batch_221203/all_instances_82K.jsonl


echo "Downloading unnatural-instructions data..."
wget -P data/raw_train/unnatural_instructions/ https://github.com/orhonovich/unnatural-instructions/raw/main/data/core_data.zip
unzip data/raw_train/unnatural_instructions/core_data.zip -d data/raw_train/unnatural_instructions/


echo "Downloading Stanford alpaca data..."
wget -P data/raw_train/stanford_alpaca/ https://github.com/tatsu-lab/stanford_alpaca/raw/main/alpaca_data.json


echo "Downloading the dolly dataset..."
wget -P data/raw_train/dolly/ https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl


echo "Downloading the OpenAssistant data (oasst1)..."
wget -P data/raw_train/oasst1/ https://huggingface.co/datasets/OpenAssistant/oasst1/resolve/main/2023-04-12_oasst_ready.trees.jsonl.gz
gzip -d data/raw_train/oasst1/2023-04-12_oasst_ready.trees.jsonl.gz


echo "Downloading the code alpaca dataset..."
wget -P data/raw_train/code_alpaca/ https://github.com/sahil280114/codealpaca/raw/master/data/code_alpaca_20k.json


echo "Downloading the gpt4-llm dataset..."
wget -P data/raw_train/gpt4_alpaca/ https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/raw/main/data/alpaca_gpt4_data.json
wget -P data/raw_train/gpt4_alpaca/ https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/raw/main/data/alpaca_gpt4_data_zh.json


echo "Downloading the baize dataset..."
wget -P data/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/alpaca_chat_data.json
wget -P data/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/medical_chat_data.json
wget -P data/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/quora_chat_data.json
wget -P data/raw_train/baize/ https://github.com/project-baize/baize-chatbot/raw/main/data/stackoverflow_chat_data.json


echo "Downloading ShareGPT dataset..."
wget -P data/raw_train/sharegpt/ https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part1_html_cleaned.json
wget -P data/raw_train/sharegpt/ https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/HTML_cleaned_raw_dataset/sg_90k_part2_html_cleaned.json
echo "Splitting the ShareGPT dataset..."
# NOTE(review): scripts/split_sharegpt_conversations.py and the relative
# ../hf_llama_models/7B/ path are not part of this repo snapshot — confirm
# they exist in the runtime environment.
python scripts/split_sharegpt_conversations.py \
    --in-files data/raw_train/sharegpt/sg_90k_part1_html_cleaned.json data/raw_train/sharegpt/sg_90k_part2_html_cleaned.json \
    --out-file data/raw_train/sharegpt/sharegpt_html_cleaned_and_split.json \
    --model-name-or-path ../hf_llama_models/7B/


echo "Downloading LIMA dataset..."
wget --header="Authorization: Bearer $HF_TOKEN" -P data/raw_train/lima/ https://huggingface.co/datasets/GAIR/lima/raw/main/train.jsonl


echo "Downloading WizardLM dataset..."
wget -P data/raw_train/wizardlm/ https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k/resolve/main/WizardLM_evol_instruct_V2_143k.json


echo "Downloading the OpenOrca dataset..."
wget -P data/raw_train/open_orca/ https://huggingface.co/datasets/Open-Orca/OpenOrca/resolve/main/1M-GPT4-Augmented.parquet
wget -P data/raw_train/open_orca/ https://huggingface.co/datasets/Open-Orca/OpenOrca/resolve/main/3_5M-GPT3_5-Augmented.parquet


echo "Reformatting the datasets..."
# Fixed filename: the repo ships open_instruct/reformat_data.py (the original
# command referenced a non-existent reformat_datasets.py).
python open_instruct/reformat_data.py --raw_data_dir data/raw_train/ --output_dir data/processed/


echo "Creating Tulu data mixtures..."
mkdir -p data/processed/tulu/
cat data/processed/flan_v2/flan_v2_data.jsonl \
    data/processed/cot/cot_data.jsonl \
    data/processed/dolly/dolly_data.jsonl \
    data/processed/oasst1/oasst1_data.jsonl \
    data/processed/gpt4_alpaca/gpt4_alpaca_data.jsonl \
    data/processed/code_alpaca/code_alpaca_data.jsonl \
    data/processed/sharegpt/sharegpt_data.jsonl \
    > data/processed/tulu/tulu_v1_mix.jsonl

cat data/processed/flan_v2/flan_v2_data.jsonl \
    data/processed/cot/cot_data.jsonl \
    data/processed/dolly/dolly_data.jsonl \
    data/processed/oasst1/oasst1_data.jsonl \
    > data/processed/tulu/tulu_v1_human_mix.jsonl

cat data/processed/flan_v2/flan_v2_data.jsonl \
    data/processed/cot/cot_data.jsonl \
    data/processed/oasst1/oasst1_data.jsonl \
    data/processed/lima/lima_data.jsonl \
    data/processed/code_alpaca/code_alpaca_data.jsonl \
    data/processed/sharegpt/sharegpt_data.jsonl \
    data/processed/wizardlm/wizardlm_data.jsonl \
    data/processed/open_orca/open_orca_data.jsonl \
    > data/processed/tulu/tulu_v2_mix.jsonl

cat data/processed/flan_v2/flan_v2_data.jsonl \
    data/processed/cot/cot_data.jsonl \
    data/processed/oasst1/oasst1_data.jsonl \
    data/processed/lima/lima_data.jsonl \
    > data/processed/tulu/tulu_v2_human_mix.jsonl
|
olive_farm.png
ADDED
|
Git LFS Details
|
open_instruct/get_data_stats.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
import tqdm
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import numpy as np
|
| 7 |
+
import argparse
|
| 8 |
+
from datasets import load_dataset
|
| 9 |
+
from transformers import AutoTokenizer
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def get_statistics_for_messages_data(
    data_path,
    tokenizer_name_or_path="/net/nfs.cirrascale/allennlp/yizhongw/hf_llama_models/7B",
):
    """Compute turn-count and token-length statistics for chat-style data.

    Args:
        data_path: Path to a jsonl file whose records contain a "messages"
            list of {"role", "content"} dicts and an "id" field (the id is
            used to report the longest instances).
        tokenizer_name_or_path: HuggingFace tokenizer used for token counts.
            Defaults to the previously hard-coded cluster-local LLaMA-7B path
            so existing callers are unaffected; pass a hub name (e.g.
            "huggyllama/llama-7b") outside that cluster.

    Returns:
        A JSON-serializable dict of summary statistics.
    """
    # load dataset
    dataset = load_dataset("json", data_files={"train": data_path})
    # tokenize dataset
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=False)
    # get statistics
    num_instances = len(dataset["train"])
    num_of_turns = [len(instance["messages"]) for instance in dataset["train"]]
    user_prompt_lengths = []
    assistant_response_lengths = []
    instance_lengths = []
    for instance in tqdm.tqdm(dataset["train"], desc="Processing instances"):
        instance_length = 0
        for message in instance["messages"]:
            # Only user/assistant turns are tokenized and counted; any other
            # role (e.g. "system") does not contribute to the statistics.
            if message["role"] == "user":
                user_prompt_lengths.append(len(tokenizer(message["content"], truncation=False, add_special_tokens=False)["input_ids"]))
                instance_length += user_prompt_lengths[-1]
            elif message["role"] == "assistant":
                assistant_response_lengths.append(len(tokenizer(message["content"], truncation=False, add_special_tokens=False)["input_ids"]))
                instance_length += assistant_response_lengths[-1]
        instance_lengths.append(instance_length)

    # ids of the 100 longest instances, longest first
    top_100_longest_instances = np.argsort(instance_lengths)[-100:][::-1].tolist()
    top_100_longest_instances = [dataset["train"][i]["id"] for i in top_100_longest_instances]

    result = {
        "num_instances": num_instances,
        "turns_summary": pd.Series(num_of_turns).describe(),
        "user_prompt_lengths_summary": pd.Series(user_prompt_lengths).describe(),
        "assistant_response_lengths_summary": pd.Series(assistant_response_lengths).describe(),
        "total_lengths_summary": pd.Series(instance_lengths).describe(),
        "num_instances_with_total_length_gt_512": np.sum(np.array(instance_lengths) > 512),
        "num_instances_with_total_length_gt_768": np.sum(np.array(instance_lengths) > 768),
        "num_instances_with_total_length_gt_1024": np.sum(np.array(instance_lengths) > 1024),
        "num_instances_with_total_length_gt_1536": np.sum(np.array(instance_lengths) > 1536),
        "num_instances_with_total_length_gt_2048": np.sum(np.array(instance_lengths) > 2048),
        "num_instances_with_total_length_gt_4096": np.sum(np.array(instance_lengths) > 4096),
        "top_100_longest_instances": top_100_longest_instances,
    }

    # convert everything to dict or scalar so the result is json-serializable
    for key, value in result.items():
        if isinstance(value, pd.Series):
            result[key] = value.to_dict()
        elif isinstance(value, np.ndarray):
            result[key] = value.tolist()
        elif isinstance(value, np.int64):
            result[key] = int(value)

    return result
|
| 62 |
+
|
| 63 |
+
def get_statistics_for_prompt_completion_data(
    data_path,
    tokenizer_name_or_path="/net/nfs.cirrascale/allennlp/yizhongw/hf_llama_models/7B",
):
    """Compute token-length statistics for prompt/completion-style data.

    Args:
        data_path: Path to a jsonl file whose records contain "prompt" and
            "completion" string fields.
        tokenizer_name_or_path: HuggingFace tokenizer used for token counts.
            Defaults to the previously hard-coded cluster-local LLaMA-7B path
            so existing callers are unaffected.

    Returns:
        A JSON-serializable dict of summary statistics.

    NOTE(review): unlike get_statistics_for_messages_data, this loads the
    tokenizer without use_fast=False; fast vs. slow tokenizers can differ
    slightly in token counts — confirm whether the inconsistency is intended.
    """
    # load dataset
    dataset = load_dataset("json", data_files={"train": data_path})
    prompts = [instance["prompt"] for instance in dataset["train"]]
    completions = [instance["completion"] for instance in dataset["train"]]
    # tokenize dataset (batched: one call for all prompts, one for completions)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
    tokenized_prompts = tokenizer(prompts, truncation=False, add_special_tokens=False)
    tokenized_completions = tokenizer(completions, truncation=False, add_special_tokens=False)
    # get statistics
    num_instances = len(dataset["train"])
    prompt_lengths = [len(tokenized_prompts["input_ids"][i]) for i in range(num_instances)]
    completion_lengths = [len(tokenized_completions["input_ids"][i]) for i in range(num_instances)]
    prompt_completion_lengths = [prompt_lengths[i] + completion_lengths[i] for i in range(num_instances)]

    result = {
        "num_instances": num_instances,
        "prompt_lengths_summary": pd.Series(prompt_lengths).describe(),
        "completion_lengths_summary": pd.Series(completion_lengths).describe(),
        "prompt_completion_lengths_summary": pd.Series(prompt_completion_lengths).describe(),
        "num_instances_with_prompt_length_gt_512": np.sum(np.array(prompt_lengths) > 512),
        "num_instances_with_completion_length_gt_512": np.sum(np.array(completion_lengths) > 512),
        "num_instances_with_prompt_completion_length_gt_512": np.sum(np.array(prompt_completion_lengths) > 512),
        "num_instances_with_completion_length_gt_768": np.sum(np.array(completion_lengths) > 768),
        "num_instances_with_prompt_completion_length_gt_1024": np.sum(np.array(prompt_completion_lengths) > 1024),
    }

    # convert everything to dict or scalar so the result is json-serializable
    for key, value in result.items():
        if isinstance(value, pd.Series):
            result[key] = value.to_dict()
        elif isinstance(value, np.ndarray):
            result[key] = value.tolist()
        elif isinstance(value, np.int64):
            result[key] = int(value)

    return result
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
if __name__ == "__main__":
    # CLI entry point: sniff the first record of the input jsonl to decide
    # which statistics routine applies, print the result, optionally save it.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--data_path", type=str, required=True)
    arg_parser.add_argument("--save_path", type=str, help="Path to save the statistics.")
    cli_args = arg_parser.parse_args()

    # Only the first line is inspected; the rest of the file is assumed to
    # share its schema.
    with open(cli_args.data_path, "r") as fin:
        first_record = json.loads(fin.readline())

    if "prompt" in first_record:
        stats = get_statistics_for_prompt_completion_data(cli_args.data_path)
    elif "messages" in first_record:
        stats = get_statistics_for_messages_data(cli_args.data_path)
    else:
        raise ValueError("Invalid data format - the data should be either prompt completion data or messages data.")

    print(json.dumps(stats, indent=4))

    if cli_args.save_path is not None:
        with open(cli_args.save_path, "w") as fout:
            json.dump(stats, fout, indent=4)
|
open_instruct/reformat_data.py
ADDED
|
@@ -0,0 +1,551 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
# coding=utf-8
|
| 3 |
+
'''
|
| 4 |
+
This script is used to reformat the downloaded datasets into the format that can be used by the model.
|
| 5 |
+
Here we use jsonl for the converted data. Each line in the jsonl file is a json object formatted as follows:
|
| 6 |
+
{
|
| 7 |
+
"dataset": "dataset_name",
|
| 8 |
+
"id": "unique_id",
|
| 9 |
+
"messages": [
|
| 10 |
+
{"role": "system", "content": "message_text"}, # optional
|
| 11 |
+
{"role": "user", "content": "message_text"},
|
| 12 |
+
{"role": "assistant", "content": "message_text"},
|
| 13 |
+
{"role": "user", "content": "message_text"},
|
| 14 |
+
{"role": "assistant", "content": "message_text"},
|
| 15 |
+
...
|
| 16 |
+
],
|
| 17 |
+
}
|
| 18 |
+
'''
|
| 19 |
+
|
| 20 |
+
import json
|
| 21 |
+
import random
|
| 22 |
+
import re
|
| 23 |
+
import os
|
| 24 |
+
import pandas as pd
|
| 25 |
+
import argparse
|
| 26 |
+
from instruction_encode_templates import encode_instruction_example, encode_few_shot_example
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def convert_super_ni_data(data_dir, output_dir, zero_shot_examples_per_task=60, few_shot_examples_per_task=20, n_few_shot=2):
    """Convert Super-NaturalInstructions training tasks to the unified messages jsonl format.

    For each task in the xlingual train split, samples up to
    ``zero_shot_examples_per_task + few_shot_examples_per_task`` instances and
    writes one jsonl record per instance to ``<output_dir>/super_ni_data.jsonl``.
    The first portion is encoded zero-shot; the remainder is encoded with up to
    ``n_few_shot`` positive examplars prepended to the prompt.
    """
    os.makedirs(output_dir, exist_ok=True)
    train_tasks = []
    with open(os.path.join(data_dir, "splits", "xlingual", "train_tasks.txt"), "r") as fin:
        for line in fin:
            if not "_mmmlu_" in line:  # skip mmlu to avoid test leakage
                train_tasks.append(line.strip())
    with open(os.path.join(output_dir, "super_ni_data.jsonl"), "w") as fout:
        for task in train_tasks:
            with open(os.path.join(data_dir, "tasks", f"{task}.json"), "r") as fin:
                task_data = json.load(fin)
            instruction = task_data["Definition"][0]
            # Sample only when the task has more instances than we need.
            if zero_shot_examples_per_task + few_shot_examples_per_task < len(task_data["Instances"]):
                instances = random.sample(task_data["Instances"], k=zero_shot_examples_per_task+few_shot_examples_per_task)
            else:
                instances = task_data["Instances"]
            # First slice: zero-shot encoding (instruction + input only).
            for instance in instances[:zero_shot_examples_per_task]:
                encoded_example = encode_instruction_example(
                    instruction=instruction,
                    input=instance["input"],
                    output=instance["output"][0],
                    random_template=True,
                    eos_token=None
                )
                fout.write(json.dumps({
                    "dataset": "super_ni",
                    "id": f"super_ni_{instance['id']}",
                    "messages": [
                        {"role": "user", "content": encoded_example["prompt"]},
                        {"role": "assistant", "content": encoded_example["completion"]},
                    ]
                }) + "\n")
            # Remaining slice: few-shot encoding with sampled positive examplars.
            for instance in instances[zero_shot_examples_per_task:]:
                if n_few_shot < len(task_data["Positive Examples"]):
                    examplars = random.sample(task_data["Positive Examples"], k=n_few_shot)
                else:
                    examplars = task_data["Positive Examples"]
                encoded_example = encode_few_shot_example(
                    instruction=instruction,
                    examplars=examplars,
                    input=instance["input"],
                    output=instance["output"][0],
                    eos_token=None
                )
                fout.write(json.dumps({
                    "dataset": "super_ni",
                    "id": f"super_ni_{instance['id']}",
                    "messages": [
                        {"role": "user", "content": encoded_example["prompt"]},
                        {"role": "assistant", "content": encoded_example["completion"]},
                    ]
                }) + "\n")
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def convert_cot_data(data_dir, output_dir, num_zero_shot_examples=50000, num_few_shot_examples=50000):
    """Convert FLAN chain-of-thought data into the unified messages jsonl format.

    Reads the zero-shot (``cot_zsopt.jsonl``) and few-shot (``cot_fsopt.jsonl``)
    opt-format files, takes a random subsample of each when the requested count
    is smaller than the file, and writes ``<output_dir>/cot_data.jsonl``.

    Args:
        data_dir: folder containing the two raw CoT jsonl files.
        output_dir: destination folder (created if missing).
        num_zero_shot_examples: max zero-shot examples to keep (0 skips the file).
        num_few_shot_examples: max few-shot examples to keep (0 skips the file).
    """
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    # BUG FIX: this branch was previously gated on `num_few_shot_examples > 0`,
    # which silently dropped all zero-shot data whenever few-shot was disabled.
    if num_zero_shot_examples > 0:
        with open(os.path.join(data_dir, "cot_zsopt.jsonl"), "r") as fin:
            zero_shot_examples = [json.loads(line) for line in fin]
            if num_zero_shot_examples < len(zero_shot_examples):
                zero_shot_examples = random.sample(zero_shot_examples, k=num_zero_shot_examples)
            examples.extend(zero_shot_examples)
    if num_few_shot_examples > 0:
        with open(os.path.join(data_dir, "cot_fsopt.jsonl"), "r") as fin:
            few_shot_examples = [json.loads(line) for line in fin]
            if num_few_shot_examples < len(few_shot_examples):
                few_shot_examples = random.sample(few_shot_examples, k=num_few_shot_examples)
            examples.extend(few_shot_examples)
    output_path = os.path.join(output_dir, "cot_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            prompt = example["inputs"]
            # Ensure the prompt is cleanly separated from the completion.
            if not prompt.endswith("\n") and not prompt.rstrip().endswith(":"):
                prompt += "\n"
            completion = example["targets"]
            fout.write(json.dumps({
                "dataset": "cot",
                "id": f"cot_{idx}",
                "messages": [
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": completion},
                ]
            }) + "\n")
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def convert_flan_v2_data(data_dir, output_dir):
    """Convert the resampled FLAN v2 mixture into the unified messages jsonl format."""
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(data_dir, "flan_v2_resampled_100k.jsonl"), "r") as fin:
        examples = [json.loads(raw_line) for raw_line in fin]
    with open(os.path.join(output_dir, "flan_v2_data.jsonl"), "w") as fout:
        for idx, example in enumerate(examples):
            prompt = example["inputs"]
            # Ensure the prompt is cleanly separated from the completion.
            if not prompt.endswith("\n") and not prompt.rstrip().endswith(":"):
                prompt += "\n"
            record = {
                "dataset": "flan_v2",
                "id": f"flan_v2_{idx}",
                "messages": [
                    {"role": "user", "content": prompt},
                    {"role": "assistant", "content": example["targets"]},
                ],
            }
            fout.write(json.dumps(record) + "\n")
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def convert_dolly_data(data_dir, output_dir):
    """Convert the databricks-dolly-15k dataset into the unified messages jsonl format."""
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(data_dir, "databricks-dolly-15k.jsonl"), "r") as fin:
        examples = [json.loads(raw_line) for raw_line in fin]
    with open(os.path.join(output_dir, "dolly_data.jsonl"), "w") as fout:
        for idx, example in enumerate(examples):
            # Render the (instruction, context, response) triple with a random prompt template.
            encoded = encode_instruction_example(
                instruction=example["instruction"],
                input=example["context"],
                output=example["response"],
                random_template=True,
                eos_token=None
            )
            fout.write(json.dumps({
                "dataset": "dolly",
                "id": f"dolly_{idx}",
                "messages": [
                    {"role": "user", "content": encoded["prompt"]},
                    {"role": "assistant", "content": encoded["completion"]},
                ]
            }) + "\n")
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def convert_self_instruct_data(data_dir, output_dir):
    """Convert the Self-Instruct 82K instances into the unified messages jsonl format."""
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(data_dir, "all_instances_82K.jsonl"), "r") as fin:
        examples = [json.loads(raw_line) for raw_line in fin]
    with open(os.path.join(output_dir, "self_instruct_data.jsonl"), "w") as fout:
        for idx, example in enumerate(examples):
            # Render instruction/input/output with a random prompt template.
            encoded = encode_instruction_example(
                instruction=example["instruction"],
                input=example["input"],
                output=example["output"],
                random_template=True,
                eos_token=None
            )
            fout.write(json.dumps({
                "dataset": "self_instruct",
                "id": f"self_instruct_{idx}",
                "messages": [
                    {"role": "user", "content": encoded["prompt"]},
                    {"role": "assistant", "content": encoded["completion"]},
                ]
            }) + "\n")
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def convert_unnatural_instructions_data(data_dir, output_dir):
    """Convert Unnatural Instructions core data into the unified messages jsonl format."""
    os.makedirs(output_dir, exist_ok=True)
    instance_cnt = 0
    in_path = os.path.join(data_dir, "core_data.jsonl")
    out_path = os.path.join(output_dir, "unnatural_instructions_data.jsonl")
    with open(in_path, "r") as fin, open(out_path, "w") as fout:
        for raw_line in fin:
            task_data = json.loads(raw_line)
            instruction = task_data["instruction"]
            for instance in task_data["instances"]:
                constraints = instance["constraints"]
                # Append per-instance constraints to the instruction unless they are "none".
                if constraints and constraints.lower() not in ["none", "none."]:
                    instance_instruction = instruction + "\n" + constraints
                else:
                    instance_instruction = instruction
                encoded = encode_instruction_example(
                    instruction=instance_instruction,
                    input=instance["input"],
                    output=instance["output"],
                    random_template=True,
                    eos_token=None
                )
                fout.write(json.dumps({
                    "dataset": "unnatural_instructions",
                    "id": f"unnatural_instructions_{instance_cnt}",
                    "messages": [
                        {"role": "user", "content": encoded["prompt"]},
                        {"role": "assistant", "content": encoded["completion"]},
                    ]
                }) + "\n")
                instance_cnt += 1
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def convert_stanford_alpaca_data(data_dir, output_dir):
    """Convert Stanford Alpaca 52K instruction data into the unified messages jsonl format."""
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(data_dir, "alpaca_data.json"), "r") as fin:
        examples = list(json.load(fin))
    with open(os.path.join(output_dir, "stanford_alpaca_data.jsonl"), "w") as fout:
        for idx, example in enumerate(examples):
            # Render instruction/input/output with a random prompt template.
            encoded = encode_instruction_example(
                instruction=example["instruction"],
                input=example["input"],
                output=example["output"],
                random_template=True,
                eos_token=None
            )
            fout.write(json.dumps({
                "dataset": "stanford_alpaca",
                "id": f"stanford_alpaca_{idx}",
                "messages": [
                    {"role": "user", "content": encoded["prompt"]},
                    {"role": "assistant", "content": encoded["completion"]},
                ]
            }) + "\n")
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
def convert_code_alpaca_data(data_dir, output_dir):
    """Convert Code Alpaca 20K instruction data into the unified messages jsonl format."""
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(data_dir, "code_alpaca_20k.json"), "r") as fin:
        examples = list(json.load(fin))
    with open(os.path.join(output_dir, "code_alpaca_data.jsonl"), "w") as fout:
        for idx, example in enumerate(examples):
            # Render instruction/input/output with a random prompt template.
            encoded = encode_instruction_example(
                instruction=example["instruction"],
                input=example["input"],
                output=example["output"],
                random_template=True,
                eos_token=None
            )
            fout.write(json.dumps({
                "dataset": "code_alpaca",
                "id": f"code_alpaca_{idx}",
                "messages": [
                    {"role": "user", "content": encoded["prompt"]},
                    {"role": "assistant", "content": encoded["completion"]},
                ]
            }) + "\n")
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
def convert_gpt4_alpaca_data(data_dir, output_dir, load_en=True, load_zh=False):
    """Convert GPT-4-generated Alpaca data (English and/or Chinese) to the unified format."""
    os.makedirs(output_dir, exist_ok=True)
    source_files = []
    if load_en:
        source_files.append("alpaca_gpt4_data.json")
    if load_zh:
        source_files.append("alpaca_gpt4_data_zh.json")
    examples = []
    for filename in source_files:
        with open(os.path.join(data_dir, filename), "r") as fin:
            examples.extend(json.load(fin))
    with open(os.path.join(output_dir, "gpt4_alpaca_data.jsonl"), "w") as fout:
        for idx, example in enumerate(examples):
            # Render instruction/input/output with a random prompt template.
            encoded = encode_instruction_example(
                instruction=example["instruction"],
                input=example["input"],
                output=example["output"],
                random_template=True,
                eos_token=None
            )
            fout.write(json.dumps({
                "dataset": "gpt4_alpaca",
                "id": f"gpt4_alpaca_{idx}",
                "messages": [
                    {"role": "user", "content": encoded["prompt"]},
                    {"role": "assistant", "content": encoded["completion"]},
                ]
            }) + "\n")
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
def convert_sharegpt_data(data_dir, output_dir):
    """Convert cleaned & split ShareGPT conversations into the unified messages jsonl format.

    Conversations containing "system" or "bing" turns are dropped and counted;
    any other unknown sender raises a ValueError.
    """
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(data_dir, "sharegpt_html_cleaned_and_split.json"), "r") as fin:
        examples = json.load(fin)

    role_by_sender = {"human": "user", "user": "user", "gpt": "assistant", "chatgpt": "assistant"}
    invalid_cnt = 0
    with open(os.path.join(output_dir, "sharegpt_data.jsonl"), "w") as fout:
        for example in examples:
            messages = []
            valid = True
            for message in example["conversations"]:
                sender = message["from"]
                if sender in role_by_sender:
                    messages.append({"role": role_by_sender[sender], "content": message["value"]})
                elif sender in ("system", "bing"):
                    # These conversations are not plain user/assistant chats; skip them.
                    valid = False
                    invalid_cnt += 1
                    break
                else:
                    raise ValueError(f"Unknown message sender: {sender}")
            if messages and valid:
                fout.write(json.dumps({
                    "dataset": "sharegpt",
                    "id": f"sharegpt_{example['id']}",
                    "messages": messages
                }) + "\n")
    print(f"# of invalid examples in sharegpt data: {invalid_cnt}")
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
def convert_baize_data(data_dir, output_dir):
    """Convert Baize self-chat transcripts into the unified messages jsonl format."""
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    for source in ["alpaca", "medical", "quora", "stackoverflow"]:
        with open(os.path.join(data_dir, f"{source}_chat_data.json"), "r") as fin:
            examples.extend(json.load(fin))

    with open(os.path.join(output_dir, "baize_data.jsonl"), "w") as fout:
        for idx, example in enumerate(examples):
            # Each example is a single transcript delimited by [|Human|]/[|AI|] tags;
            # every segment after a [|Human|] tag holds one user/assistant exchange.
            messages = []
            for exchange in example["input"].split("[|Human|]")[1:]:
                if not exchange.strip() or "[|AI|]" not in exchange:
                    continue
                user_part, ai_part = exchange.split("[|AI|]")
                messages.append({"role": "user", "content": user_part.strip()})
                messages.append({"role": "assistant", "content": ai_part.strip()})
            fout.write(json.dumps({
                "dataset": "baize",
                "id": f"baize_{idx}",
                "messages": messages
            }) + "\n")
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
def convert_oasst1_data(data_dir, output_dir):
    '''
    For OASST1, because it's in a tree structure, where every user input might get multiple replies,
    we have to save every path from the root node to the assistant reply (including both leaf node and intermediate node).
    This results in some of the messages being duplicated among different paths (instances).
    Be careful when using this dataset for training. Ideally, you should only minimize the loss of the last message in each path.
    '''
    os.makedirs(output_dir, exist_ok=True)
    conversations = []
    with open(os.path.join(data_dir, "2023-04-12_oasst_ready.trees.jsonl"), "r") as fin:
        for line in fin:
            conversations.append(json.loads(line))

    output_path = os.path.join(output_dir, "oasst1_data.jsonl")

    # we filter out the sequences that mention the creator information
    filter_strings = [
        "LAION",
        "Open Asssistant",
        "OpenAssistant",
    ]

    # traverse the conversation tree, and collect all valid sequences
    def dfs(reply, messages, valid_sequences):
        # Prune the whole subtree if this node's text mentions the creators.
        if any([filter_string in reply["text"] for filter_string in filter_strings]):
            return
        if reply["role"] == "assistant":
            messages.append(
                {"role": "assistant", "content": reply["text"]}
            )
            if not reply["replies"]:  # leaf node
                # Snapshot (copy) the path, since `messages` is mutated below.
                valid_sequences.append(messages[:])
            else:
                for child in reply["replies"]:
                    dfs(child, messages, valid_sequences)
            # Backtrack: `messages` is shared across the whole traversal.
            messages.pop()
        elif reply["role"] == "prompter":
            messages.append(
                {"role": "user", "content": reply["text"]}
            )
            for child in reply["replies"]:
                dfs(child, messages, valid_sequences)
            messages.pop()
        else:
            raise ValueError(f"Unknown role: {reply['role']}")

    with open(output_path, "w") as fout:
        example_cnt = 0
        for _, conversation in enumerate(conversations):
            valid_sequences = []
            dfs(conversation["prompt"], [], valid_sequences)
            for sequence in valid_sequences:
                fout.write(json.dumps({
                    "dataset": "oasst1",
                    "id": f"oasst1_{example_cnt}",
                    "messages": sequence
                }) + "\n")
                example_cnt += 1
|
| 431 |
+
|
| 432 |
+
|
| 433 |
+
def convert_lima_data(data_dir, output_dir):
    """Convert the LIMA training set into the unified messages jsonl format.

    Each LIMA example is a flat list of alternating user/assistant turns. An
    example with an odd number of turns has its trailing (unanswered) message
    dropped, with a warning printed.
    """
    os.makedirs(output_dir, exist_ok=True)
    examples = []
    with open(os.path.join(data_dir, "train.jsonl"), "r") as fin:
        for line in fin:
            examples.append(json.loads(line))
    output_path = os.path.join(output_dir, "lima_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            messages = []
            turns = example["conversations"]
            if len(turns) % 2 != 0:
                # Fix: the message previously read "Waring". Using a local
                # `turns` also avoids mutating the caller-visible example dict.
                print(f"Warning: example {idx} in LIMA has odd number of messages. Cutting off the last message.")
                turns = turns[:-1]

            # Even positions are user turns, odd positions are assistant replies.
            for i in range(0, len(turns), 2):
                messages.append({
                    "role": "user",
                    "content": turns[i]
                })
                messages.append({
                    "role": "assistant",
                    "content": turns[i+1]
                })
            fout.write(json.dumps({
                "dataset": "lima",
                "id": f"lima_{idx}",
                "messages": messages,
            }) + "\n")
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
def convert_wizardlm_data(data_dir, output_dir):
    """Convert WizardLM evol-instruct V2 conversations into the unified messages jsonl format."""
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(data_dir, "WizardLM_evol_instruct_V2_143k.json"), "r") as fin:
        examples = json.load(fin)

    with open(os.path.join(output_dir, "wizardlm_data.jsonl"), "w") as fout:
        for idx, example in enumerate(examples):
            turns = example["conversations"]
            # Conversations must strictly alternate human -> gpt.
            assert len(turns) % 2 == 0
            messages = []
            for human_turn, gpt_turn in zip(turns[0::2], turns[1::2]):
                assert human_turn["from"] == "human"
                assert gpt_turn["from"] == "gpt"
                messages.append({"role": "user", "content": human_turn["value"]})
                messages.append({"role": "assistant", "content": gpt_turn["value"]})
            # Note: the record id uses the dataset's own 'idx' field, not the loop index.
            fout.write(json.dumps({
                "dataset": "wizardlm",
                "id": f"wizardlm_{example['idx']}",
                "messages": messages,
            }) + "\n")
|
| 490 |
+
|
| 491 |
+
|
| 492 |
+
def convert_open_orca_data(data_dir, output_dir, num_gpt4_examples=100000, num_gpt35_examples=0):
    """Convert OpenOrca parquet dumps into the unified messages jsonl format.

    Takes a random subset of ``num_gpt4_examples`` GPT-4-augmented rows and
    ``num_gpt35_examples`` GPT-3.5-augmented rows. The system prompt is kept
    as a separate "system" message in each record.
    """
    os.makedirs(output_dir, exist_ok=True)
    examples = []

    # NOTE(review): both parquet files are always read in full, even when the
    # corresponding num_*_examples is 0 — loading 3.5M rows may be slow.
    df = pd.read_parquet(os.path.join(data_dir, "1M-GPT4-Augmented.parquet"))
    gpt4_examples = [row.to_dict() for _, row in df.iterrows()]
    random.shuffle(gpt4_examples)  # shuffle-then-slice == uniform random subset
    examples.extend(gpt4_examples[:num_gpt4_examples])

    df = pd.read_parquet(os.path.join(data_dir, "3_5M-GPT3_5-Augmented.parquet"))
    gpt35_examples = [row.to_dict() for _, row in df.iterrows()]
    random.shuffle(gpt35_examples)
    examples.extend(gpt35_examples[:num_gpt35_examples])

    output_path = os.path.join(output_dir, "open_orca_data.jsonl")
    with open(output_path, "w") as fout:
        for idx, example in enumerate(examples):
            messages = [
                {"role": "system", "content": example["system_prompt"]},
                {"role": "user", "content": example["question"]},
                {"role": "assistant", "content": example["response"]}
            ]
            fout.write(json.dumps({
                "dataset": "open_orca",
                "id": f"open_orca_{example['id']}",
                "messages": messages,
            }) + "\n")
|
| 519 |
+
|
| 520 |
+
|
| 521 |
+
if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--raw_data_dir", type=str, default="data/downloads")
    arg_parser.add_argument("--output_dir", type=str, default="data/processed")
    arg_parser.add_argument("--seed", type=int, default=42)
    args = arg_parser.parse_args()
    random.seed(args.seed)  # make sampling in the converters reproducible

    # get the subfolder names in raw_data_dir
    subfolders = [f for f in os.listdir(args.raw_data_dir) if os.path.isdir(os.path.join(args.raw_data_dir, f))]

    # all supported datasets: inferred from the convert_<name>_data functions defined above
    supported_datasets = []
    all_funcs = [func_name for func_name in globals() if callable(globals()[func_name])]
    for func_name in all_funcs:
        if re.match(r"convert_.+_data", func_name):
            supported_datasets.append(func_name[8:-5])  # strip "convert_" prefix and "_data" suffix

    # check if the subfolder names are supported datasets
    valid_subfolders = []
    for subfolder in subfolders:
        if subfolder not in supported_datasets:
            print(f"Warning: {subfolder} in the raw data folder is not a supported dataset. We will skip it.")
        else:
            valid_subfolders.append(subfolder)

    # prepare data for each dataset
    statistics = {}  # NOTE(review): currently unused in this chunk; possibly filled by code not shown
    for subfolder in valid_subfolders:
        print(f"Processing {subfolder} data...")
        # dispatch to the matching converter, mirroring raw_data_dir layout under output_dir
        globals()[f"convert_{subfolder}_data"](os.path.join(args.raw_data_dir, subfolder), os.path.join(args.output_dir, subfolder))
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
pdfplumber
|
| 3 |
+
docx2txt
|
| 4 |
+
justext
|
| 5 |
+
openai
|
| 6 |
+
langchain
|
web-app.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
| 3 |
+
from langchain.vectorstores import FAISS, Chroma
|
| 4 |
+
from langchain.text_splitter import CharacterTextSplitter
|
| 5 |
+
from langchain.llms import OpenAI as OpenAI_llm
|
| 6 |
+
from langchain.chat_models import ChatOpenAI
|
| 7 |
+
from langchain.chains import ConversationalRetrievalChain,RetrievalQA
|
| 8 |
+
from langchain.memory import ConversationBufferMemory
|
| 9 |
+
from langchain.document_loaders import PyPDFLoader, TextLoader, WebBaseLoader
|
| 10 |
+
from langchain.prompts.chat import ChatPromptTemplate,HumanMessagePromptTemplate,SystemMessagePromptTemplate
|
| 11 |
+
# from langchain.chains.qa_with_sources import load_qa_with_sources_chain,BaseCombineDocumentsChain
|
| 12 |
+
import os
|
| 13 |
+
import chromadb
|
| 14 |
+
import tempfile
|
| 15 |
+
import requests
|
| 16 |
+
import openai
|
| 17 |
+
from bs4 import BeautifulSoup
|
| 18 |
+
from urllib.parse import urlparse
|
| 19 |
+
|
| 20 |
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
| 21 |
+
|
| 22 |
+
def assistant(url):
    """Streamlit handler: answer a typed question about the page at `url` via retrieval-augmented QA."""
    question = st.text_input("Ask your Question")

    if st.button("Submit", type="primary"):
        ABS_PATH: str = os.path.dirname(os.path.abspath(__file__))
        DB_DIR: str = os.path.join(ABS_PATH, "db")

        # Fetch and parse the page content.
        loader = WebBaseLoader(url)
        data = loader.load()

        # Chunk the page into ~1000-character pieces for embedding.
        text_splitter = CharacterTextSplitter(separator='\n', chunk_size=1000, chunk_overlap=0)
        docs = text_splitter.split_documents(data)

        # Embed the chunks and index them in an in-memory FAISS store.
        openai_embeddings = OpenAIEmbeddings()
        # client = chromadb.PersistentClient(path=DB_DIR)
        vectordb = FAISS.from_documents(documents=docs, embedding=openai_embeddings)
        # vectordb.persist()
        retriever = vectordb.as_retriever()

        # Answer the question with a retrieval-augmented "stuff" chain.
        llm = ChatOpenAI(model_name='gpt-3.5-turbo')
        qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
        response = qa(question)
        st.write(response)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# Page entry point: render the title, ask for a URL, and hand it to the
# assistant handler once the user has provided one.
st.title('Chat with Website')

url=st.text_input('Enter Your URL here:')

if url:
    assistant(url)
|