Upload 27 files
Browse files- .gitattributes +2 -0
- .gitignore +119 -0
- LICENSE +201 -0
- assets/indexing_stage_positive_thinking_company.png +0 -0
- assets/multi_index_queries_positive_thinking_company.png +3 -0
- assets/rag_overview_positive_thinking_company.png +3 -0
- multi_index_demo/__init__.py +1 -0
- multi_index_demo/app.py +61 -0
- multi_index_demo/app_config.yaml +29 -0
- multi_index_demo/config.py +86 -0
- multi_index_demo/data/cv_comparison_pdf/Curriculum Vitae Anna Mustermann.pdf +0 -0
- multi_index_demo/data/cv_comparison_pdf/Curriculum Vitae John Doe.pdf +0 -0
- multi_index_demo/data/cv_comparison_pdf/Curriculum Vitae Max Mustermann.pdf +0 -0
- multi_index_demo/esco_skill_graph/esco_skill_extractor.py +69 -0
- multi_index_demo/esco_skill_graph/esco_skill_graph.py +89 -0
- multi_index_demo/esco_skill_graph/esco_skill_mapping.py +67 -0
- multi_index_demo/esco_skill_graph/skill_list_prompt.txt +18 -0
- multi_index_demo/indexing_utils.py +107 -0
- multi_index_demo/paths.py +3 -0
- multi_index_demo/query_executers.py +98 -0
- multi_index_demo/response_clustering.py +74 -0
- multi_index_demo/streamlit_utils.py +51 -0
- multi_index_demo/style.css +15 -0
- poetry.lock +0 -0
- pyproject.toml +34 -0
- requirements.txt +13 -0
- tests/__init__.py +0 -0
- tests/test_version.py +5 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
assets/multi_index_queries_positive_thinking_company.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
assets/rag_overview_positive_thinking_company.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Data
|
| 10 |
+
*.json
|
| 11 |
+
*.jsonl
|
| 12 |
+
*.pickle
|
| 13 |
+
*.xlsx
|
| 14 |
+
|
| 15 |
+
# Distribution / packaging
|
| 16 |
+
.Python
|
| 17 |
+
build/
|
| 18 |
+
develop-eggs/
|
| 19 |
+
dist/
|
| 20 |
+
downloads/
|
| 21 |
+
eggs/
|
| 22 |
+
.eggs/
|
| 23 |
+
lib/
|
| 24 |
+
lib64/
|
| 25 |
+
parts/
|
| 26 |
+
sdist/
|
| 27 |
+
var/
|
| 28 |
+
wheels/
|
| 29 |
+
*.egg-info/
|
| 30 |
+
.installed.cfg
|
| 31 |
+
*.egg
|
| 32 |
+
MANIFEST
|
| 33 |
+
|
| 34 |
+
# PyInstaller
|
| 35 |
+
# Usually these files are written by a python script from a template
|
| 36 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 37 |
+
*.manifest
|
| 38 |
+
*.spec
|
| 39 |
+
|
| 40 |
+
# Installer logs
|
| 41 |
+
pip-log.txt
|
| 42 |
+
pip-delete-this-directory.txt
|
| 43 |
+
|
| 44 |
+
# Unit test / coverage reports
|
| 45 |
+
htmlcov/
|
| 46 |
+
.tox/
|
| 47 |
+
.coverage
|
| 48 |
+
.coverage.*
|
| 49 |
+
.cache
|
| 50 |
+
nosetests.xml
|
| 51 |
+
coverage.xml
|
| 52 |
+
*.cover
|
| 53 |
+
.hypothesis/
|
| 54 |
+
.pytest_cache/
|
| 55 |
+
|
| 56 |
+
# Translations
|
| 57 |
+
*.mo
|
| 58 |
+
*.pot
|
| 59 |
+
|
| 60 |
+
# Django stuff:
|
| 61 |
+
#*.log
|
| 62 |
+
local_settings.py
|
| 63 |
+
db.sqlite3
|
| 64 |
+
|
| 65 |
+
# Flask stuff:
|
| 66 |
+
instance/
|
| 67 |
+
.webassets-cache
|
| 68 |
+
|
| 69 |
+
# Scrapy stuff:
|
| 70 |
+
.scrapy
|
| 71 |
+
|
| 72 |
+
# Sphinx documentation
|
| 73 |
+
docs/_build/
|
| 74 |
+
|
| 75 |
+
# PyBuilder
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# pyenv
|
| 82 |
+
.python-version
|
| 83 |
+
|
| 84 |
+
# celery beat schedule file
|
| 85 |
+
celerybeat-schedule
|
| 86 |
+
|
| 87 |
+
# SageMath parsed files
|
| 88 |
+
*.sage.py
|
| 89 |
+
|
| 90 |
+
# Environments
|
| 91 |
+
.env
|
| 92 |
+
.venv
|
| 93 |
+
env/
|
| 94 |
+
venv/
|
| 95 |
+
ENV/
|
| 96 |
+
env.bak/
|
| 97 |
+
venv.bak/
|
| 98 |
+
|
| 99 |
+
# Spyder project settings
|
| 100 |
+
.spyderproject
|
| 101 |
+
.spyproject
|
| 102 |
+
|
| 103 |
+
# Rope project settings
|
| 104 |
+
.ropeproject
|
| 105 |
+
|
| 106 |
+
# mkdocs documentation
|
| 107 |
+
/site
|
| 108 |
+
|
| 109 |
+
# mypy
|
| 110 |
+
.mypy_cache/
|
| 111 |
+
|
| 112 |
+
# idea
|
| 113 |
+
.idea/*
|
| 114 |
+
.idea
|
| 115 |
+
idea/
|
| 116 |
+
|
| 117 |
+
# Exclude model.bin files from repo
|
| 118 |
+
*.bin
|
| 119 |
+
*vocab.txt
|
LICENSE
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Apache License
|
| 2 |
+
Version 2.0, January 2004
|
| 3 |
+
http://www.apache.org/licenses/
|
| 4 |
+
|
| 5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 6 |
+
|
| 7 |
+
1. Definitions.
|
| 8 |
+
|
| 9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
| 10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
| 11 |
+
|
| 12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
| 13 |
+
the copyright owner that is granting the License.
|
| 14 |
+
|
| 15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
| 16 |
+
other entities that control, are controlled by, or are under common
|
| 17 |
+
control with that entity. For the purposes of this definition,
|
| 18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
| 19 |
+
direction or management of such entity, whether by contract or
|
| 20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 22 |
+
|
| 23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
| 24 |
+
exercising permissions granted by this License.
|
| 25 |
+
|
| 26 |
+
"Source" form shall mean the preferred form for making modifications,
|
| 27 |
+
including but not limited to software source code, documentation
|
| 28 |
+
source, and configuration files.
|
| 29 |
+
|
| 30 |
+
"Object" form shall mean any form resulting from mechanical
|
| 31 |
+
transformation or translation of a Source form, including but
|
| 32 |
+
not limited to compiled object code, generated documentation,
|
| 33 |
+
and conversions to other media types.
|
| 34 |
+
|
| 35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
| 36 |
+
Object form, made available under the License, as indicated by a
|
| 37 |
+
copyright notice that is included in or attached to the work
|
| 38 |
+
(an example is provided in the Appendix below).
|
| 39 |
+
|
| 40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
| 41 |
+
form, that is based on (or derived from) the Work and for which the
|
| 42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
| 43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
| 44 |
+
of this License, Derivative Works shall not include works that remain
|
| 45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
| 46 |
+
the Work and Derivative Works thereof.
|
| 47 |
+
|
| 48 |
+
"Contribution" shall mean any work of authorship, including
|
| 49 |
+
the original version of the Work and any modifications or additions
|
| 50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
| 51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
| 53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
| 54 |
+
means any form of electronic, verbal, or written communication sent
|
| 55 |
+
to the Licensor or its representatives, including but not limited to
|
| 56 |
+
communication on electronic mailing lists, source code control systems,
|
| 57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
| 58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
| 59 |
+
excluding communication that is conspicuously marked or otherwise
|
| 60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
| 61 |
+
|
| 62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
| 64 |
+
subsequently incorporated within the Work.
|
| 65 |
+
|
| 66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
| 70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
| 71 |
+
Work and such Derivative Works in Source or Object form.
|
| 72 |
+
|
| 73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
| 74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 76 |
+
(except as stated in this section) patent license to make, have made,
|
| 77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 78 |
+
where such license applies only to those patent claims licensable
|
| 79 |
+
by such Contributor that are necessarily infringed by their
|
| 80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
| 81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
| 82 |
+
institute patent litigation against any entity (including a
|
| 83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 84 |
+
or a Contribution incorporated within the Work constitutes direct
|
| 85 |
+
or contributory patent infringement, then any patent licenses
|
| 86 |
+
granted to You under this License for that Work shall terminate
|
| 87 |
+
as of the date such litigation is filed.
|
| 88 |
+
|
| 89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
| 90 |
+
Work or Derivative Works thereof in any medium, with or without
|
| 91 |
+
modifications, and in Source or Object form, provided that You
|
| 92 |
+
meet the following conditions:
|
| 93 |
+
|
| 94 |
+
(a) You must give any other recipients of the Work or
|
| 95 |
+
Derivative Works a copy of this License; and
|
| 96 |
+
|
| 97 |
+
(b) You must cause any modified files to carry prominent notices
|
| 98 |
+
stating that You changed the files; and
|
| 99 |
+
|
| 100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
| 101 |
+
that You distribute, all copyright, patent, trademark, and
|
| 102 |
+
attribution notices from the Source form of the Work,
|
| 103 |
+
excluding those notices that do not pertain to any part of
|
| 104 |
+
the Derivative Works; and
|
| 105 |
+
|
| 106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
| 107 |
+
distribution, then any Derivative Works that You distribute must
|
| 108 |
+
include a readable copy of the attribution notices contained
|
| 109 |
+
within such NOTICE file, excluding those notices that do not
|
| 110 |
+
pertain to any part of the Derivative Works, in at least one
|
| 111 |
+
of the following places: within a NOTICE text file distributed
|
| 112 |
+
as part of the Derivative Works; within the Source form or
|
| 113 |
+
documentation, if provided along with the Derivative Works; or,
|
| 114 |
+
within a display generated by the Derivative Works, if and
|
| 115 |
+
wherever such third-party notices normally appear. The contents
|
| 116 |
+
of the NOTICE file are for informational purposes only and
|
| 117 |
+
do not modify the License. You may add Your own attribution
|
| 118 |
+
notices within Derivative Works that You distribute, alongside
|
| 119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
| 120 |
+
that such additional attribution notices cannot be construed
|
| 121 |
+
as modifying the License.
|
| 122 |
+
|
| 123 |
+
You may add Your own copyright statement to Your modifications and
|
| 124 |
+
may provide additional or different license terms and conditions
|
| 125 |
+
for use, reproduction, or distribution of Your modifications, or
|
| 126 |
+
for any such Derivative Works as a whole, provided Your use,
|
| 127 |
+
reproduction, and distribution of the Work otherwise complies with
|
| 128 |
+
the conditions stated in this License.
|
| 129 |
+
|
| 130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
| 132 |
+
by You to the Licensor shall be under the terms and conditions of
|
| 133 |
+
this License, without any additional terms or conditions.
|
| 134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
| 135 |
+
the terms of any separate license agreement you may have executed
|
| 136 |
+
with Licensor regarding such Contributions.
|
| 137 |
+
|
| 138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
| 139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 140 |
+
except as required for reasonable and customary use in describing the
|
| 141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 142 |
+
|
| 143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 144 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 147 |
+
implied, including, without limitation, any warranties or conditions
|
| 148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 150 |
+
appropriateness of using or redistributing the Work and assume any
|
| 151 |
+
risks associated with Your exercise of permissions under this License.
|
| 152 |
+
|
| 153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 154 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 155 |
+
unless required by applicable law (such as deliberate and grossly
|
| 156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 157 |
+
liable to You for damages, including any direct, indirect, special,
|
| 158 |
+
incidental, or consequential damages of any character arising as a
|
| 159 |
+
result of this License or out of the use or inability to use the
|
| 160 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 161 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 162 |
+
other commercial damages or losses), even if such Contributor
|
| 163 |
+
has been advised of the possibility of such damages.
|
| 164 |
+
|
| 165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
| 166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 168 |
+
or other liability obligations and/or rights consistent with this
|
| 169 |
+
License. However, in accepting such obligations, You may act only
|
| 170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 171 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 172 |
+
defend, and hold each Contributor harmless for any liability
|
| 173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 174 |
+
of your accepting any such warranty or additional liability.
|
| 175 |
+
|
| 176 |
+
END OF TERMS AND CONDITIONS
|
| 177 |
+
|
| 178 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 179 |
+
|
| 180 |
+
To apply the Apache License to your work, attach the following
|
| 181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 182 |
+
replaced with your own identifying information. (Don't include
|
| 183 |
+
the brackets!) The text should be enclosed in the appropriate
|
| 184 |
+
comment syntax for the file format. We also recommend that a
|
| 185 |
+
file or class name and description of purpose be included on the
|
| 186 |
+
same "printed page" as the copyright notice for easier
|
| 187 |
+
identification within third-party archives.
|
| 188 |
+
|
| 189 |
+
Copyright [yyyy] [name of copyright owner]
|
| 190 |
+
|
| 191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 192 |
+
you may not use this file except in compliance with the License.
|
| 193 |
+
You may obtain a copy of the License at
|
| 194 |
+
|
| 195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 196 |
+
|
| 197 |
+
Unless required by applicable law or agreed to in writing, software
|
| 198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 200 |
+
See the License for the specific language governing permissions and
|
| 201 |
+
limitations under the License.
|
assets/indexing_stage_positive_thinking_company.png
ADDED
|
assets/multi_index_queries_positive_thinking_company.png
ADDED
|
Git LFS Details
|
assets/rag_overview_positive_thinking_company.png
ADDED
|
Git LFS Details
|
multi_index_demo/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
__version__ = "0.1.2"  # package version string
|
multi_index_demo/app.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
from streamlit_utils import local_css, remote_css, load_pdf_files
|
| 6 |
+
from config import load_app_config, set_global_api_key
|
| 7 |
+
from indexing_utils import ServiceContextLoader, create_multi_index
|
| 8 |
+
from query_executers import QueryExecuter
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def main():
    """
    Entry point of the Streamlit application.

    Sets the OpenAI API key, loads styling and configuration, lets the user
    upload PDF files, builds a multi-index query engine over their content,
    and runs the query/response loop.
    """
    # Set the OpenAI Api key as an environment variable
    set_global_api_key()

    dirname = Path(os.path.dirname(__file__))
    local_css((dirname / "style.css").as_posix())
    remote_css('https://fonts.googleapis.com/icon?family=Material+Icons')

    # Load a Configuration object for the application
    app_config = load_app_config()

    # Initialize a ServiceContext for the QueryEngine
    service_context = ServiceContextLoader(app_config=app_config).load()

    # Initialize a simple SentenceTransformer model for clustering the final responses
    sbert_model = SentenceTransformer(app_config.ClusteringConfig.SentenceTransformerModel)

    st.title("Parallel Multi-Document Question Answering")

    # Provide a file_uploader with drag and drop functionality
    multiple_files = st.file_uploader(
        "Drop multiple files:", accept_multiple_files=True
    )

    # BUG FIX: with accept_multiple_files=True, st.file_uploader returns a
    # (possibly empty) list, never None, so the original `is None` check was
    # dead code and "No upload" was never shown. Check for emptiness instead.
    if not multiple_files:
        st.text("No upload")
    else:
        # Keep only PDF uploads
        files = [file for file in multiple_files if str(file.name).endswith(".pdf")]

        # Load the pdf files based on the file objects
        file_content_list = load_pdf_files(files=files)

        if file_content_list:
            top_k = app_config.QueryEngineConfig.similarity_top_k

            # Create a multi-index query engine based on the pdf file content
            multi_index_query_engine = create_multi_index(file_content_list=file_content_list,
                                                          _service_context=service_context,
                                                          top_k=top_k)

            # Execute the query and display the results in the streamlit app
            query_executer = QueryExecuter(query_engine=multi_index_query_engine,
                                           sbert_model=sbert_model,
                                           config=app_config)
            query_executer.run()


if __name__ == "__main__":
    main()
|
multi_index_demo/app_config.yaml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configure the Large Language Model API calls
|
| 2 |
+
LLMConfig:
|
| 3 |
+
temperature: 0
|
| 4 |
+
model: "text-davinci-003"
|
| 5 |
+
max_tokens: 250
|
| 6 |
+
# Configure how the pdf file texts should be chunked into Node texts
|
| 7 |
+
SimpleNodeParser:
|
| 8 |
+
chunk_size: 1024
|
| 9 |
+
chunk_overlap: 20
|
| 10 |
+
# Specify how many contexts to retrieve in the retrieval step
|
| 11 |
+
QueryEngineConfig:
|
| 12 |
+
similarity_top_k: 3
|
| 13 |
+
# Configure the prompt for response synthesis
|
| 14 |
+
PromptHelper:
|
| 15 |
+
context_window: 4096
|
| 16 |
+
chunk_overlap_ratio: 0.1
|
| 17 |
+
chunk_size_limit: null
|
| 18 |
+
# Specify the SentenceTransformer model for semantic clustering of responses
|
| 19 |
+
ClusteringConfig:
|
| 20 |
+
SentenceTransformerModel: "all-mpnet-base-v2"
|
| 21 |
+
# Specify the Config for the Esco Faiss index skill query API
|
| 22 |
+
EscoSkillApiConfig:
|
| 23 |
+
index_name: esco_skill_index
|
| 24 |
+
top_k: 1
|
| 25 |
+
# Config for the API that extracts a semicolon-separated list of skills from a skill description text
|
| 26 |
+
SkillsToListConfig:
|
| 27 |
+
llm_model_name: "text-davinci-003"
|
| 28 |
+
temperature: 0.0
|
| 29 |
+
prompt_template: skill_list_prompt.txt
|
multi_index_demo/config.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import yaml
|
| 2 |
+
from pydantic import BaseModel
|
| 3 |
+
from typing import Optional
|
| 4 |
+
import os
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import openai
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
|
| 9 |
+
class LLMConfig(BaseModel):
    """
    Configuration of the LLM prompt parameters.

    Field names match the keys of the ``LLMConfig`` section in
    ``app_config.yaml``.
    """
    temperature: float  # LLM sampling temperature (0 in the shipped config)
    model: str          # name of the LLM model, e.g. "text-davinci-003"
    max_tokens: int     # maximum number of tokens the LLM may generate
|
| 16 |
+
|
| 17 |
+
class SimpleNodeParser(BaseModel):
    """
    Configuration of the simple Document-to-Node parser, i.e. how the PDF
    file texts are chunked into Node texts.

    Field names match the ``SimpleNodeParser`` section in ``app_config.yaml``.
    """
    chunk_size: int     # target size of one text chunk
    chunk_overlap: int  # overlap between consecutive chunks
|
| 23 |
+
|
| 24 |
+
class PromptHelper(BaseModel):
    """
    Prompt helper config constraining the prompt context size and the
    overlap between chunks during response synthesis.

    Field names match the ``PromptHelper`` section in ``app_config.yaml``.
    """
    context_window: int              # maximum prompt context window size
    chunk_overlap_ratio: float       # fractional overlap between chunks
    chunk_size_limit: Optional[int]  # optional hard chunk-size limit (YAML `null` -> None)
|
| 32 |
+
|
| 33 |
+
class QueryEngineConfig(BaseModel):
    """
    Configuration of the query engine.
    """
    similarity_top_k: int  # number of contexts to retrieve in the retrieval step
|
| 38 |
+
|
| 39 |
+
class ClusteringConfig(BaseModel):
    """
    Configuration of the SentenceTransformer embedding model used for
    cluster analysis of the final responses.
    """
    # Model name, e.g. "all-mpnet-base-v2". Kept in CamelCase so the field
    # name matches the key in app_config.yaml.
    SentenceTransformerModel: str
|
| 44 |
+
|
| 45 |
+
class EscoSkillApiConfig(BaseModel):
    """
    Configuration of the ESCO Faiss-index skill query API.
    """
    index_name: str  # name of the Faiss skill index to query
    top_k: int       # number of nearest skills to retrieve per query
|
| 48 |
+
|
| 49 |
+
class SkillsToListConfig(BaseModel):
    """
    Configuration for extracting a semicolon-separated list of skills from a
    plain-text skill description (used by esco_skill_graph/esco_skill_extractor.py).
    """
    llm_model_name: str   # name of the LLM used for the extraction prompt
    temperature: float    # LLM sampling temperature
    prompt_template: str  # prompt template filename, resolved relative to esco_skill_graph/
|
| 53 |
+
|
| 54 |
+
class AppConfig(BaseModel):
    """
    Top-level config object for the Streamlit application.

    Each field deliberately shadows the name of its model class so the field
    names line up 1:1 with the top-level keys of ``app_config.yaml``.
    """
    LLMConfig: LLMConfig
    PromptHelper: PromptHelper
    SimpleNodeParser: SimpleNodeParser
    QueryEngineConfig: QueryEngineConfig
    ClusteringConfig: ClusteringConfig
    EscoSkillApiConfig: EscoSkillApiConfig
    SkillsToListConfig: SkillsToListConfig
|
| 65 |
+
|
| 66 |
+
def load_app_config() -> AppConfig:
    """
    Load and validate the application configuration.

    Reads ``app_config.yaml`` from the directory this module lives in and
    parses it into an :class:`AppConfig` instance.

    :return: Initialized AppConfig object
    """
    config_path = Path(os.path.dirname(__file__)) / "app_config.yaml"
    with config_path.open("r", encoding="utf-8") as config_file:
        raw_config = yaml.safe_load(config_file)

    return AppConfig(**raw_config)
|
| 79 |
+
|
| 80 |
+
def set_global_api_key():
    """
    Set the OpenAI API key globally on the ``openai`` package.

    Loads variables from a local ``.env`` file (if present) and then reads
    ``OPENAI_API_KEY`` from the environment.

    :raises KeyError: if ``OPENAI_API_KEY`` is set neither in the
        environment nor in the ``.env`` file.
    """
    load_dotenv()

    api_key = os.environ.get("OPENAI_API_KEY")
    if api_key is None:
        # Fail early with an actionable message instead of the bare KeyError
        # the original direct os.environ["OPENAI_API_KEY"] lookup produced.
        raise KeyError(
            "OPENAI_API_KEY is not set. Export it or add it to a .env file."
        )

    openai.api_key = api_key
|
multi_index_demo/data/cv_comparison_pdf/Curriculum Vitae Anna Mustermann.pdf
ADDED
|
Binary file (78.8 kB). View file
|
|
|
multi_index_demo/data/cv_comparison_pdf/Curriculum Vitae John Doe.pdf
ADDED
|
Binary file (78.8 kB). View file
|
|
|
multi_index_demo/data/cv_comparison_pdf/Curriculum Vitae Max Mustermann.pdf
ADDED
|
Binary file (79.2 kB). View file
|
|
|
multi_index_demo/esco_skill_graph/esco_skill_extractor.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain import PromptTemplate
|
| 2 |
+
from langchain.llms import OpenAI
|
| 3 |
+
import os
|
| 4 |
+
from config import set_global_api_key
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import List
|
| 7 |
+
from paths import REPO_DIR_PATH
|
| 8 |
+
|
| 9 |
+
set_global_api_key()
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class SkillsToList:
    """
    Query engine that extracts individual skills from plain-text skill
    descriptions via an LLM prompt.
    """

    def __init__(self, model_name: str, temperature: float, prompt_template_path: str):
        """
        :param model_name: Name of the LLM to be used for prompting
        :param temperature: Temperature prompt parameter, high - explorative, low - conservative
        :param prompt_template_path: Filepath to the prompt template, relative to esco_skill_graph/
        """
        # initialize the LLM Api
        self.openai_engine = OpenAI(
            model_name=model_name,
            openai_api_key=os.environ["OPENAI_API_KEY"],
            temperature=temperature
        )

        # Read a prompt template for skill extraction from a list of skills
        prompt_path = REPO_DIR_PATH / "esco_skill_graph" / prompt_template_path
        with prompt_path.open("r", encoding="utf-8") as f:
            template = f.read()

        # Create a prompt template; the template must contain a {context} placeholder
        self.prompt_template = PromptTemplate(
            input_variables=["context"],
            template=template
        )

    def extract_skill_list(self, skill_description: str) -> List[str]:
        """
        Extract a list of skills from one description.

        (BUG FIX: the original docstring opened with `\"\"\"\"`, leaving a stray
        quote character at the start of the docstring.)

        :param skill_description: A descriptive text outlining soft skills & hard skills

        :return: List of skills
        """
        prompt = self.prompt_template.format(context=skill_description)

        result = self.openai_engine(prompt)

        # BUG FIX: also drop empty entries (e.g. from a trailing ";" or an
        # empty completion) that previously produced "" pseudo-skills.
        skills = [skill.strip() for skill in result.strip().split(";") if skill.strip()]

        return skills

    def __call__(self, skill_descriptions: List[str]) -> List[List[str]]:
        """
        Extract one skill list per description.

        :param skill_descriptions: List of plain text skill descriptions

        :return: Lists of skills, one inner list per input description
        """
        return [
            self.extract_skill_list(skill_description=skill_desc)
            for skill_desc in skill_descriptions
        ]
|
multi_index_demo/esco_skill_graph/esco_skill_graph.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import networkx as nx
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
import matplotlib
|
| 5 |
+
import streamlit as st
|
| 6 |
+
from typing import List, Dict, Union, Set
|
| 7 |
+
from matplotlib.pyplot import figure, text
|
| 8 |
+
|
| 9 |
+
class SkillGraph:
    """
    Graph visualization of the overlap of skills and competencies of different candidates.

    Candidates and skills become nodes; an edge connects a candidate to every skill
    mentioned in their profile, so shared skills show up as shared neighbours.
    """

    def __init__(self, graph_type: str = "spring"):
        """
        :param graph_type: networkx layout to use, either 'spring' or 'shell'
        """
        assert graph_type in ["spring", "shell"], "graph type must be 'spring' or 'shell'"
        self.graph_type = graph_type

    def get_edge_df(self, skill_lists: List[Dict], unique_skills: Set) -> pd.DataFrame:
        """
        Build the edge list of the graph (candidate -> skill).

        :param skill_lists: One dict per candidate with the keys 'id' and 'skills'
        :param unique_skills: Set of all skills that should appear in the graph

        :return: DataFrame with the columns 'from' (candidate id) and 'to' (skill)
        """
        relationship_list = []
        for skill_dict in skill_lists:
            candidate_id = skill_dict.get("id")
            # Only connect the candidate to skills that are part of the plotted skill set
            intersections = unique_skills.intersection(set(skill_dict.get("skills")))
            for skill in intersections:
                relationship_list.append({"from": candidate_id, "to": skill})

        relationships = pd.DataFrame(relationship_list)
        return relationships

    def get_node_df(self, unique_skills: Set, skill_lists: List[Dict]) -> pd.DataFrame:
        """
        Build the node table of the graph.

        :param unique_skills: Set of all skills that should appear in the graph
        :param skill_lists: One dict per candidate with the keys 'id' and 'skills'

        :return: DataFrame with the columns 'ID' and 'type' ('candidate' or 'skill')
        """
        candidate_nodes = [{"ID": skill_dict.get("id"), "type": "candidate"} for skill_dict in skill_lists]
        skill_nodes = [{"ID": skill, "type": "skill"} for skill in unique_skills]
        nodes = candidate_nodes + skill_nodes
        carac = pd.DataFrame(nodes)
        return carac

    def plot_skill_graph(self, skill_lists: List[Dict]):
        """
        Plot the actual skill graph based on a list of provided skills per candidate.

        :param skill_lists: Skills per candidate (dicts with keys 'id' and 'skills')
        """
        unique_skills = set([skill for skills in skill_lists for skill in skills.get("skills")])

        relationships = self.get_edge_df(skill_lists=skill_lists, unique_skills=unique_skills)
        carac = self.get_node_df(skill_lists=skill_lists, unique_skills=unique_skills)

        # Set overall figure size
        fig, ax = plt.subplots()
        fig.tight_layout()

        # Create graph object
        G = nx.from_pandas_edgelist(relationships, 'from', 'to', create_using=nx.Graph())

        # Align the node attribute table with the graph's node order and make the
        # type a categorical so .cat.codes can be used as the color index below
        # (the original also had a bare no-op statement `carac['type'].cat.codes`,
        # removed here)
        carac = carac.set_index('ID')
        carac = carac.reindex(G.nodes())
        carac['type'] = pd.Categorical(carac['type'])

        # Candidates in dodgerblue, skills in lightgray (order of the categorical codes)
        cmap = matplotlib.colors.ListedColormap(['dodgerblue', 'lightgray'])

        # Draw candidate nodes larger than skill nodes
        node_sizes = [1000 if entry == 'candidate' else 250 for entry in carac.type]

        if self.graph_type == "spring":
            pos = nx.spring_layout(G)
            nx.draw(G, pos=pos, with_labels=False, node_color=carac['type'].cat.codes, cmap=cmap,
                    node_size=node_sizes, edgecolors='gray')

        elif self.graph_type == "shell":
            pos = nx.shell_layout(G)
            nx.draw_shell(G, pos=pos, with_labels=False, node_color=carac['type'].cat.codes, cmap=cmap,
                          node_size=node_sizes, edgecolors='gray')

        # Label every node at its layout position
        for node, (x, y) in pos.items():
            text(x, y, node, fontsize=8, ha='center', va='center')

        plt.title('European Skills, Competences, Qualifications and Occupations (ESCO) skill network', fontsize=14)

        st.pyplot(fig)
|
multi_index_demo/esco_skill_graph/esco_skill_mapping.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sentence_transformers import SentenceTransformer
|
| 3 |
+
import faiss
|
| 4 |
+
import numpy as np
|
| 5 |
+
from typing import List, Union
|
| 6 |
+
import logging
|
| 7 |
+
import os
|
| 8 |
+
from paths import REPO_DIR_PATH
|
| 9 |
+
|
| 10 |
+
logging.basicConfig(level=logging.DEBUG)
|
| 11 |
+
|
| 12 |
+
class EscoSkillApi:
    """
    Semantic search API over the ESCO (European Skills, Competences, Qualifications
    and Occupations) skill vocabulary.

    Skills are embedded with an SBERT model and stored in a FAISS inner-product
    index; free-text queries are mapped to their nearest ESCO skill names.
    """

    def __init__(self,
                 sbert_model: "SentenceTransformer",
                 index_name: str,
                 top_k: int):
        """
        :param sbert_model: SentenceTransformer used to embed skills and queries
        :param index_name: File name of the persisted FAISS index
        :param top_k: Number of nearest ESCO skills returned per query
        """
        # NOTE(review): requires the env var ESCO_NER_SEARCHTERMS to point to the ESCO csv
        self.esco_skills = self.load_esco_dataset(filepath=os.environ["ESCO_NER_SEARCHTERMS"])
        self.sbert_model = sbert_model
        self.index_name = index_name
        try:
            self.index = faiss.read_index(self._index_path())
        except Exception:
            # No (readable) persisted index yet -> build and persist it from scratch
            self.create_index()
        self.top_k = top_k

    def _index_path(self) -> str:
        """Return the posix path under which the FAISS index is persisted."""
        return (REPO_DIR_PATH / "esco_skill_graph" / self.index_name).as_posix()

    def load_esco_dataset(self, filepath: str) -> List[str]:
        """
        Load a dataset with the European Skills and Competencies.

        :param filepath: Filepath to the ESCO dataset csv (needs a 'skill' column)

        :return: Sorted, de-duplicated list of ESCO skill names
        """
        skill_search_df = pd.read_csv(filepath)
        esco_skills = list(sorted(set(skill_search_df.skill.astype(str).tolist())))

        return esco_skills

    def create_index(self):
        """
        Embed all ESCO skills, build a FAISS inner-product index over them and
        persist it under the configured index name.
        """
        encoded_data = self.sbert_model.encode(self.esco_skills)
        # 768 = embedding dimension of the SBERT model -- TODO confirm for other models
        self.index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
        self.index.add_with_ids(encoded_data, np.array(range(0, len(self.esco_skills))))
        # Bug fix: persist under the same path that __init__ tries to read from.
        # The original wrote to the hard-coded name 'esco_skill_index' in the cwd
        # and therefore never found the index again on the next start-up.
        faiss.write_index(self.index, self._index_path())

    def run_query(self, query: str) -> Union[List, str]:
        """
        Map a free-text skill query to its nearest ESCO skill name(s).

        :param query: Free-text skill description

        :return: The single best matching skill (str) if top_k == 1,
                 otherwise the list of the top_k matching skills
        """
        query_vector = self.sbert_model.encode([query])
        top_k = self.index.search(query_vector, self.top_k)

        results = [self.esco_skills[_id] for _id in top_k[1].tolist()[0]]

        # Bug fix: the original compared the faiss result tuple ('top_k') to 1,
        # which is never true; the intent was to check the configured self.top_k
        if self.top_k == 1:
            return results[0]
        else:
            return results

    def run_queries(self, queries: List[str]) -> List[str]:
        """
        Map several queries to their single best ESCO skill each.

        :param queries: Free-text skill descriptions

        :return: Best matching ESCO skill per query
        """
        query_results = []
        for query in queries:
            res = self.run_query(query=query)
            # run_query returns a plain string when top_k == 1; indexing that with
            # [0] would only return its first character
            query_results.append(res if isinstance(res, str) else res[0])
        return query_results
|
multi_index_demo/esco_skill_graph/skill_list_prompt.txt
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Act as if you are an HR recruiter reading the CV of a candidate and extracting a list
|
| 2 |
+
of skills from a Curriculum Vitae.
|
| 3 |
+
|
| 4 |
+
Provide a semicolon separated list of skills based on a provided CV.
|
| 5 |
+
|
| 6 |
+
Examples:
|
| 7 |
+
|
| 8 |
+
Context: The hard-skills of Data Scientist 1 are: python, R, Databricks, Docker, Tableau, PowerBI, AWS, Azure, Kubernetes
|
| 9 |
+
|
| 10 |
+
Skills: python; R; Databricks; Docker; Tableau; PowerBI; AWS; Azure; Kubernetes
|
| 11 |
+
|
| 12 |
+
Context: Hard-skills of Data Scientist 2 include: Programming Languages (Python, C#, Java, JavaScript, HTML/CSS), Problem-solving, Teamwork, Communication, Analytical Thinking.
|
| 13 |
+
|
| 14 |
+
Skills: Python; C#; Java; JavaScript; HTML/CSS; Problem-Solving; Teamwork; Communication; Analytical Thinking
|
| 15 |
+
|
| 16 |
+
Context: {context}
|
| 17 |
+
|
| 18 |
+
Skills:
|
multi_index_demo/indexing_utils.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from llama_index import ServiceContext, PromptHelper
|
| 2 |
+
from llama_index.llms import OpenAI
|
| 3 |
+
from llama_index.embeddings import OpenAIEmbedding
|
| 4 |
+
from llama_index.node_parser import SimpleNodeParser
|
| 5 |
+
from config import AppConfig
|
| 6 |
+
import streamlit as st
|
| 7 |
+
from llama_index import ServiceContext, Document, PromptHelper
|
| 8 |
+
from llama_index.llms import OpenAI
|
| 9 |
+
from llama_index.embeddings import OpenAIEmbedding
|
| 10 |
+
from llama_index.tools import QueryEngineTool, ToolMetadata
|
| 11 |
+
from llama_index.query_engine import SubQuestionQueryEngine
|
| 12 |
+
from llama_index import GPTVectorStoreIndex
|
| 13 |
+
from llama_index.node_parser import SimpleNodeParser
|
| 14 |
+
from typing import List, Dict
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class ServiceContextLoader:
    """
    Builds the llama-index ServiceContext used by the Streamlit application.
    """

    def __init__(self, app_config: AppConfig):
        """
        :param app_config: Parsed application configuration
        """
        self.app_config = app_config

    def load(self) -> ServiceContext:
        """
        Assemble the ServiceContext from the configured LLM, embedding model,
        node parser and prompt helper.

        :return: Initialized ServiceContext for the Streamlit application
        """
        cfg = self.app_config

        # LLM API wrapper
        llm = OpenAI(
            temperature=cfg.LLMConfig.temperature,
            model=cfg.LLMConfig.model,
            max_tokens=cfg.LLMConfig.max_tokens,
        )

        # Embedding model (OpenAI defaults)
        embed_model = OpenAIEmbedding()

        # Splits Documents into Nodes of the configured chunk size / overlap
        node_parser = SimpleNodeParser.from_defaults(
            chunk_size=cfg.SimpleNodeParser.chunk_size,
            chunk_overlap=cfg.SimpleNodeParser.chunk_overlap,
        )

        # Controls how much context fits into a single prompt
        prompt_helper = PromptHelper(
            context_window=cfg.PromptHelper.context_window,
            chunk_overlap_ratio=cfg.PromptHelper.chunk_overlap_ratio,
            chunk_size_limit=cfg.PromptHelper.chunk_size_limit,
        )

        # Bundle LLM, Embedding, NodeParser and PromptHelper for the query engine
        return ServiceContext.from_defaults(
            llm=llm,
            embed_model=embed_model,
            node_parser=node_parser,
            prompt_helper=prompt_helper,
        )
|
| 58 |
+
|
| 59 |
+
@st.cache_resource
def create_multi_index(file_content_list: List[Dict], _service_context: ServiceContext,
                       top_k: int=3) -> SubQuestionQueryEngine:
    """
    Create a SubQuestionQueryEngine Multi-Index based on the indices for individual pdf files.

    :param file_content_list: List with the content per pdf file; every entry needs
                              the keys 'text' and 'title' (the 'index_name' and
                              'engine_name' keys are accepted but no longer required)
    :param _service_context: ServiceContext (the leading underscore keeps streamlit
                             from trying to hash it for caching)
    :param top_k: Number of nodes retrieved per sub-query

    :return: Multi-index query engine
    """
    file_2_index = {}
    file_2_engine = {}
    for file_content in file_content_list:

        documents = [Document(text=file_content.get('text'))]
        title = file_content.get('title')

        # The original generated uniquely *named* Python variables for every index
        # and engine via exec(); plain dict entries keyed by title do the same job
        # without dynamic code execution.
        index = GPTVectorStoreIndex.from_documents(documents)
        engine = index.as_query_engine(service_context=_service_context,
                                       similarity_top_k=top_k)

        # Store each index and query engine in a dictionary
        file_2_index[title] = index
        file_2_engine[title] = engine

    # Define a List of QueryEngineTools wrapping all individual pdf file indices
    query_engine_tools = [
        QueryEngineTool(
            query_engine=engine,
            metadata=ToolMetadata(
                name=title.replace(" ", "_"),
                description=title,
            ),
        )
        for title, engine in file_2_engine.items()
    ]

    # Initialize a multi-index query engine based on all QueryEngineTools
    s_engine = SubQuestionQueryEngine.from_defaults(query_engine_tools=query_engine_tools)

    return s_engine
|
multi_index_demo/paths.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
REPO_DIR_PATH = Path(__file__).parent
|
multi_index_demo/query_executers.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from llama_index.query_engine import SubQuestionQueryEngine
|
| 2 |
+
import streamlit as st
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
from sentence_transformers import SentenceTransformer
|
| 6 |
+
|
| 7 |
+
from streamlit_utils import icon
|
| 8 |
+
from esco_skill_graph.esco_skill_extractor import SkillsToList
|
| 9 |
+
from esco_skill_graph.esco_skill_mapping import EscoSkillApi
|
| 10 |
+
from esco_skill_graph.esco_skill_graph import SkillGraph
|
| 11 |
+
from config import AppConfig
|
| 12 |
+
from response_clustering import ResponseClustering
|
| 13 |
+
from llama_index.response.schema import Response
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class QueryExecuter:
    """
    Executes queries against the multi-index and visualizes the results:
    the raw responses, a clustering of the responses and -- for skill
    related queries -- a candidate/skill network graph.
    """

    def __init__(self, query_engine: SubQuestionQueryEngine,
                 sbert_model: SentenceTransformer,
                 config: AppConfig):
        self.query_engine = query_engine
        self.sbert_model = sbert_model
        self.config = config

    def run(self):
        """
        Read a query from the text input and, once the OK button is pressed,
        run it against the multi-index and visualize the query results.
        """
        query_text = st.text_input("", "Search...")
        icon("search")
        if not st.button("OK"):
            return

        response = self.query_engine.query(str(query_text))
        st.title("Raw Search results: ")
        st.write(f"**Query: {query_text}**")

        response_df = self.response_nodes_2_df(response=response)

        st.markdown("""**Raw Response**""")
        st.write(f"""{response.response}""")
        st.write(response_df)

        resp_clustering = ResponseClustering(sbert_model=self.sbert_model)
        resp_clustering.compute_response_clusters(response_df=response_df)

        if "skill" in query_text.lower():
            st.title("Network analysis of skills")
            self.visualize_skill_graph(response_df=response_df)

    def response_nodes_2_df(self, response: Response) -> pd.DataFrame:
        """
        Flatten the source nodes of a Response into a DataFrame.

        :param response: Response object to be formatted as a pandas DataFrame

        :return: DataFrame with one row (id / response / subquery) per source node
        """
        rows = []
        for ind, node in enumerate(response.__dict__["source_nodes"]):
            parts = node.node.text.split("\nResponse: \n")
            rows.append({
                "id": f"Data Scientist {ind + 1}",
                "response": parts[-1],
                "subquery": parts[0],
            })

        return pd.DataFrame(rows)

    def visualize_skill_graph(self, response_df: pd.DataFrame):
        """
        Normalize the skills of each response against the ESCO vocabulary and
        plot them as a candidate/skill network.

        :param response_df: DataFrame containing the Query Responses per PDF index
        """
        cfg = self.config
        skills_2_list = SkillsToList(model_name=cfg.SkillsToListConfig.llm_model_name,
                                     temperature=cfg.SkillsToListConfig.temperature,
                                     prompt_template_path=cfg.SkillsToListConfig.prompt_template)

        skill_lists = skills_2_list(skill_descriptions=response_df.response.tolist())

        esco_api = EscoSkillApi(sbert_model=self.sbert_model,
                                index_name=cfg.EscoSkillApiConfig.index_name,
                                top_k=cfg.EscoSkillApiConfig.top_k)

        skill_lists_2_graph = [
            {
                "id": f"Data Scientist {ind + 1}",
                "skills": esco_api.run_queries(queries=skill_list),
            }
            for ind, skill_list in enumerate(skill_lists)
        ]

        skill_graph = SkillGraph(graph_type="spring")
        skill_graph.plot_skill_graph(skill_lists=skill_lists_2_graph)
|
multi_index_demo/response_clustering.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from scipy.cluster.hierarchy import linkage, dendrogram
|
| 2 |
+
import streamlit as st
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
import numpy as np
|
| 6 |
+
from sklearn.cluster import AgglomerativeClustering
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import seaborn as sns
|
| 11 |
+
|
| 12 |
+
@dataclass
|
| 13 |
+
class ClusterResult:
|
| 14 |
+
similarity_df: pd.DataFrame
|
| 15 |
+
cluster_df: pd.DataFrame
|
| 16 |
+
|
| 17 |
+
class ResponseClustering:
|
| 18 |
+
"""
|
| 19 |
+
This class performs clustering of the Query Responses
|
| 20 |
+
"""
|
| 21 |
+
def __init__(self, sbert_model: SentenceTransformer):
|
| 22 |
+
self.sbert_model = sbert_model
|
| 23 |
+
|
| 24 |
+
def compute_response_clusters(self, response_df: pd.DataFrame) -> ClusterResult:
|
| 25 |
+
"""
|
| 26 |
+
This method encodes the responses via SBERT, computes the cosine similarity of the
|
| 27 |
+
|
| 28 |
+
:param response_df: This is a DataFrame containing the query responses
|
| 29 |
+
|
| 30 |
+
:return: ClusterResult object with the cosine similarity DataFrame and the Cluster Result DataFrame
|
| 31 |
+
"""
|
| 32 |
+
embeddings = self.sbert_model.encode(response_df.response.tolist())
|
| 33 |
+
|
| 34 |
+
self.plot_cluster_dendrogram(encodings=embeddings)
|
| 35 |
+
|
| 36 |
+
# Compute the cosine similarity of the embeddings and plot the heatmap
|
| 37 |
+
similarity_mat = cosine_similarity(embeddings)
|
| 38 |
+
cosine_sim_df = pd.DataFrame(similarity_mat, columns=response_df.id, index=response_df.id)
|
| 39 |
+
self.plot_heatmap(similarity_df=cosine_sim_df)
|
| 40 |
+
|
| 41 |
+
# Perform agglomerative clustering and plot the dendrogram
|
| 42 |
+
clustering = AgglomerativeClustering().fit(embeddings)
|
| 43 |
+
response_df["cluster_labels"] = clustering.labels_
|
| 44 |
+
response_df = response_df.sort_values("cluster_labels")
|
| 45 |
+
cluster_result = ClusterResult(similarity_df=cosine_sim_df, cluster_df=response_df)
|
| 46 |
+
|
| 47 |
+
return cluster_result
|
| 48 |
+
|
| 49 |
+
def plot_cluster_dendrogram(self, encodings: np.ndarray):
|
| 50 |
+
# Calculate the linkage: mergings
|
| 51 |
+
mergings = linkage(encodings, method='ward')
|
| 52 |
+
|
| 53 |
+
fig, ax = plt.subplots()
|
| 54 |
+
# Plot the dendrogram, using varieties as labels
|
| 55 |
+
fig.tight_layout()
|
| 56 |
+
|
| 57 |
+
dendrogram(mergings,
|
| 58 |
+
labels=[f"skills Data Scientist {i + 1}" for i in range(encodings.shape[0])],
|
| 59 |
+
leaf_rotation=90,
|
| 60 |
+
leaf_font_size=6,
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
plt.title("Clustering Dendrogram of the Data Scientist query-response embeddings")
|
| 64 |
+
plt.xticks(rotation = 0)
|
| 65 |
+
# plt.show()
|
| 66 |
+
st.pyplot(fig)
|
| 67 |
+
|
| 68 |
+
def plot_heatmap(self, similarity_df: pd.DataFrame):
|
| 69 |
+
fig1, ax1 = plt.subplots()
|
| 70 |
+
# Plot the dendrogram, using varieties as labels
|
| 71 |
+
fig1.tight_layout()
|
| 72 |
+
plt.title("Cosine-similarity heatmap of the Data Scientist query-response embeddings")
|
| 73 |
+
sns.heatmap(similarity_df, cmap="viridis")
|
| 74 |
+
st.pyplot(fig1)
|
multi_index_demo/streamlit_utils.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import fitz
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import re
|
| 5 |
+
from typing import List, Dict
|
| 6 |
+
|
| 7 |
+
def local_css(file_name):
    """Inject the contents of a local CSS file into the Streamlit page."""
    with open(file_name) as css_file:
        css = css_file.read()
    st.markdown(f'<style>{css}</style>', unsafe_allow_html=True)
|
| 10 |
+
|
| 11 |
+
def remote_css(url):
    """Link an external stylesheet into the Streamlit page."""
    link_tag = f'<link href="{url}" rel="stylesheet">'
    st.markdown(link_tag, unsafe_allow_html=True)
|
| 13 |
+
|
| 14 |
+
def icon(icon_name):
    """Render a Material Design icon by name on the Streamlit page."""
    markup = f'<i class="material-icons">{icon_name}</i>'
    st.markdown(markup, unsafe_allow_html=True)
|
| 16 |
+
|
| 17 |
+
@st.cache_data
def load_pdf_files(files: List) -> List[Dict]:
    """
    Load and cache the text content of PDF Files based on a list of file objects.

    :param files: File objects (e.g. streamlit file uploads)

    :return: One dict per pdf with the keys 'engine_name', 'index_name', 'text'
             and 'title'
    """
    content_list = []
    # Iterating an empty list is a no-op, so the original `if len(files) > 0`
    # guard was redundant
    for ind, file in enumerate(files):
        with fitz.open(stream=file.read(), filetype="pdf") as doc:
            # Concatenate the text of all pages of the pdf
            full_text = "\n".join(page.get_text() for page in doc)

        cv_id = ind + 1
        # Raw string: "\.pdf" without the r-prefix is an invalid escape sequence
        # (SyntaxWarning on recent Python versions)
        title = re.sub(r"\.pdf", "", file.name)
        title = f"{title} Data Scientist {cv_id}"
        engine_name = re.sub(" ", "_", title)
        tmp_engine_name = f"{engine_name}_query_engine"
        tmp_index_name = f"{engine_name}_index"

        content_list.append({
            "engine_name": tmp_engine_name,
            "index_name": tmp_index_name,
            "text": full_text,
            "title": title
        })

    return content_list
|
multi_index_demo/style.css
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
body {
|
| 2 |
+
color: #fff;
|
| 3 |
+
background-color: #4F8BF9;
|
| 4 |
+
}
|
| 5 |
+
|
| 6 |
+
.stButton>button {
|
| 7 |
+
color: #4F8BF9;
|
| 8 |
+
border-radius: 50%;
|
| 9 |
+
height: 3em;
|
| 10 |
+
width: 3em;
|
| 11 |
+
}
|
| 12 |
+
|
| 13 |
+
.stTextInput>div>div>input {
|
| 14 |
+
color: #4F8BF9;
|
| 15 |
+
}
|
poetry.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[tool.poetry]
|
| 2 |
+
name = "multi-index-demo"
|
| 3 |
+
version = "0.1.2"
|
| 4 |
+
description = "This repository can be used as a demo for multi-index question answering"
|
| 5 |
+
authors = ["christoph_hiemenz <chiemenz@positivethinking.tech>"]
|
| 6 |
+
readme = "README.md"
|
| 7 |
+
packages = [{include = "multi_index_demo"}]
|
| 8 |
+
|
| 9 |
+
[tool.poetry.dependencies]
|
| 10 |
+
python = ">=3.10,<3.13"
|
| 11 |
+
streamlit = "^1.26.0"
|
| 12 |
+
llama-index = "^0.8.11.post3"
|
| 13 |
+
sentence-transformers = "^2.2.2"
|
| 14 |
+
networkx = "^3.1"
|
| 15 |
+
scikit-learn = "^1.3.0"
|
| 16 |
+
scipy = "^1.11.2"
|
| 17 |
+
pandas = "^2.0.3"
|
| 18 |
+
numpy = "^1.25.2"
|
| 19 |
+
matplotlib = "^3.7.2"
|
| 20 |
+
pymupdf = "^1.23.1"
|
| 21 |
+
faiss-cpu = "^1.7.4"
|
| 22 |
+
seaborn = "^0.12.2"
|
| 23 |
+
python-dotenv = "^1.0.0"
|
| 24 |
+
|
| 25 |
+
[tool.pytest.ini_options]
|
| 26 |
+
minversion = "6.0"
|
| 27 |
+
addopts = "-ra -q"
|
| 28 |
+
testpaths = [
|
| 29 |
+
"tests"
|
| 30 |
+
]
|
| 31 |
+
|
| 32 |
+
[build-system]
|
| 33 |
+
requires = ["poetry-core"]
|
| 34 |
+
build-backend = "poetry.core.masonry.api"
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit==1.26.0
|
| 2 |
+
llama-index==0.8.11.post3
|
| 3 |
+
sentence-transformers==2.2.2
|
| 4 |
+
networkx==3.1
|
| 5 |
+
scikit-learn==1.3.0
|
| 6 |
+
scipy==1.11.2
|
| 7 |
+
pandas==2.0.3
|
| 8 |
+
numpy==1.25.2
|
| 9 |
+
matplotlib==3.7.2
|
| 10 |
+
pymupdf==1.23.1
|
| 11 |
+
faiss-cpu==1.7.4
|
| 12 |
+
seaborn==0.12.2
|
| 13 |
+
python-dotenv==1.0.0
|
tests/__init__.py
ADDED
|
File without changes
|
tests/test_version.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from multi_index_demo import __version__
|
| 2 |
+
|
| 3 |
+
def test_version():
    """The package version must match the version declared in pyproject.toml."""
    print(__version__)
    # pyproject.toml declares version = "0.1.2"; the test previously pinned the
    # stale value "0.1.1" -- keep this assertion in sync with [tool.poetry] version
    # NOTE(review): confirm multi_index_demo/__init__.py was bumped to 0.1.2 as well
    assert __version__ == "0.1.2"
|