jaothan commited on
Commit
5559758
·
verified ·
1 Parent(s): 458a630

Upload 27 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/multi_index_queries_positive_thinking_company.png filter=lfs diff=lfs merge=lfs -text
37
+ assets/rag_overview_positive_thinking_company.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Data
10
+ *.json
11
+ *.jsonl
12
+ *.pickle
13
+ *.xlsx
14
+
15
+ # Distribution / packaging
16
+ .Python
17
+ build/
18
+ develop-eggs/
19
+ dist/
20
+ downloads/
21
+ eggs/
22
+ .eggs/
23
+ lib/
24
+ lib64/
25
+ parts/
26
+ sdist/
27
+ var/
28
+ wheels/
29
+ *.egg-info/
30
+ .installed.cfg
31
+ *.egg
32
+ MANIFEST
33
+
34
+ # PyInstaller
35
+ # Usually these files are written by a python script from a template
36
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
37
+ *.manifest
38
+ *.spec
39
+
40
+ # Installer logs
41
+ pip-log.txt
42
+ pip-delete-this-directory.txt
43
+
44
+ # Unit test / coverage reports
45
+ htmlcov/
46
+ .tox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ .hypothesis/
54
+ .pytest_cache/
55
+
56
+ # Translations
57
+ *.mo
58
+ *.pot
59
+
60
+ # Django stuff:
61
+ #*.log
62
+ local_settings.py
63
+ db.sqlite3
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+
75
+ # PyBuilder
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # pyenv
82
+ .python-version
83
+
84
+ # celery beat schedule file
85
+ celerybeat-schedule
86
+
87
+ # SageMath parsed files
88
+ *.sage.py
89
+
90
+ # Environments
91
+ .env
92
+ .venv
93
+ env/
94
+ venv/
95
+ ENV/
96
+ env.bak/
97
+ venv.bak/
98
+
99
+ # Spyder project settings
100
+ .spyderproject
101
+ .spyproject
102
+
103
+ # Rope project settings
104
+ .ropeproject
105
+
106
+ # mkdocs documentation
107
+ /site
108
+
109
+ # mypy
110
+ .mypy_cache/
111
+
112
+ # idea
113
+ .idea/*
114
+ .idea
115
+ idea/
116
+
117
+ # Exclude model.bin files from repo
118
+ *.bin
119
+ *vocab.txt
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
assets/indexing_stage_positive_thinking_company.png ADDED
assets/multi_index_queries_positive_thinking_company.png ADDED

Git LFS Details

  • SHA256: cd4e430c1c6257673de8d29ebf3b8b91436da2afcc87515b709214ac0d395b8e
  • Pointer size: 131 Bytes
  • Size of remote file: 195 kB
assets/rag_overview_positive_thinking_company.png ADDED

Git LFS Details

  • SHA256: f94d88d633942c5bb21d9e083cb5ed54f5884f54de00baeed5801f5494240a96
  • Pointer size: 131 Bytes
  • Size of remote file: 145 kB
multi_index_demo/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
# Package version string (PEP 440); bump on each release of multi_index_demo.
__version__ = "0.1.2"
multi_index_demo/app.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from pathlib import Path
4
+ from sentence_transformers import SentenceTransformer
5
+ from streamlit_utils import local_css, remote_css, load_pdf_files
6
+ from config import load_app_config, set_global_api_key
7
+ from indexing_utils import ServiceContextLoader, create_multi_index
8
+ from query_executers import QueryExecuter
9
+
10
+
11
def main():
    """
    This is the main method of the streamlit application.

    Wires together: API-key setup, page CSS, the app configuration, the
    llama-index ServiceContext, a SentenceTransformer for clustering the
    final responses, the PDF uploader, the multi-index query engine and
    the query executer that renders results.
    """
    # Set the OpenAI Api key as an environment variable
    set_global_api_key()

    dirname = Path(os.path.dirname(__file__))
    local_css((dirname / "style.css").as_posix())
    remote_css('https://fonts.googleapis.com/icon?family=Material+Icons')

    # Load a Configuration object for the application
    app_config = load_app_config()

    # Initialize a ServiceContext for the QueryEngine
    service_context = ServiceContextLoader(app_config=app_config).load()

    # Initialize a simple SentenceTransformer model for clustering the final responses
    sbert_model = SentenceTransformer(app_config.ClusteringConfig.SentenceTransformerModel)

    st.title("Parallel Multi-Document Question Answering")

    # Provide a file_uploader with drag and drop functionality
    multiple_files = st.file_uploader(
        "Drop multiple files:", accept_multiple_files=True
    )

    # BUG FIX: with accept_multiple_files=True the uploader returns a list
    # (empty when nothing is uploaded), never None, so the original
    # `if multiple_files is None` branch was unreachable. Test emptiness.
    if not multiple_files:
        st.text("No upload")
    else:
        # Keep only PDF uploads; other file types are silently ignored
        files = [file for file in multiple_files if str(file.name).endswith(".pdf")]

        # Load the pdf files based on the file objects
        file_content_list = load_pdf_files(files=files)

        if file_content_list:
            top_k = app_config.QueryEngineConfig.similarity_top_k

            # Create a multi-index query engine based on the pdf file content
            multi_index_query_engine = create_multi_index(file_content_list=file_content_list,
                                                          _service_context=service_context,
                                                          top_k=top_k)

            # Execute the query and display the results in the streamlit app
            query_executer = QueryExecuter(query_engine=multi_index_query_engine,
                                           sbert_model=sbert_model,
                                           config=app_config)
            query_executer.run()


if __name__ == "__main__":
    main()
multi_index_demo/app_config.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configure the Large Language Model Api calls
2
+ LLMConfig:
3
+ temperature: 0
4
+ model: "text-davinci-003"
5
+ max_tokens: 250
6
+ # Configure how the pdf file texts should be chunked into Node texts
7
+ SimpleNodeParser:
8
+ chunk_size: 1024
9
+ chunk_overlap: 20
10
+ # Specify how many contexts to retrieve in the retrieval step
11
+ QueryEngineConfig:
12
+ similarity_top_k: 3
13
+ # Configure the prompt for response synthesis
14
+ PromptHelper:
15
+ context_window: 4096
16
+ chunk_overlap_ratio: 0.1
17
+ chunk_size_limit: null
18
+ # Specify the SentenceTransformer model for semantic clustering of responses
19
+ ClusteringConfig:
20
+ SentenceTransformerModel: "all-mpnet-base-v2"
21
+ # Specify the Config for the Esco Faiss index skill query API
22
+ EscoSkillApiConfig:
23
+ index_name: esco_skill_index
24
+ top_k: 1
25
+ # This is a config for the Api to extract a semicolon separated list of skills from a Skill description text
26
+ SkillsToListConfig:
27
+ llm_model_name: "text-davinci-003"
28
+ temperature: 0.0
29
+ prompt_template: skill_list_prompt.txt
multi_index_demo/config.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ from pydantic import BaseModel
3
+ from typing import Optional
4
+ import os
5
+ from pathlib import Path
6
+ import openai
7
+ from dotenv import load_dotenv
8
+
9
class LLMConfig(BaseModel):
    """
    Configuration of the LLM prompt parameters
    """
    temperature: float  # sampling temperature; 0 = deterministic output
    model: str  # OpenAI model name, e.g. "text-davinci-003" (see app_config.yaml)
    max_tokens: int  # upper bound on the completion length
16
+
17
class SimpleNodeParser(BaseModel):
    """
    Configuration of the simple Document to Node parser
    """
    chunk_size: int  # size of each text chunk (Node) produced from a Document
    chunk_overlap: int  # overlap carried over between consecutive chunks
23
+
24
class PromptHelper(BaseModel):
    """
    This is a Prompt Helper config which configures actual constraints of the prompt context size
    and overlap between chunks
    """
    context_window: int  # maximum context size available to the LLM
    chunk_overlap_ratio: float  # fraction of overlap between adjacent chunks
    chunk_size_limit: Optional[int]  # hard cap on chunk size; null/None in YAML = no limit
32
+
33
class QueryEngineConfig(BaseModel):
    """
    Configuration of the query engine
    """
    similarity_top_k: int  # number of contexts to retrieve in the retrieval step
38
+
39
class ClusteringConfig(BaseModel):
    """
    Configuration of the SentenceTransformer Embedding model for cluster analysis
    """
    SentenceTransformerModel: str  # model name, e.g. "all-mpnet-base-v2"
44
+
45
class EscoSkillApiConfig(BaseModel):
    """
    Configuration for the ESCO Faiss skill-index query API.
    """
    index_name: str  # filename of the persisted Faiss index
    top_k: int  # number of nearest ESCO skills returned per query
48
+
49
class SkillsToListConfig(BaseModel):
    """
    Config for the API that extracts a semicolon-separated list of skills
    from a plain-text skill description.
    """
    llm_model_name: str  # LLM used for the skill-extraction prompt
    temperature: float  # sampling temperature for the extraction call
    prompt_template: str  # prompt template filename, e.g. "skill_list_prompt.txt"
53
+
54
class AppConfig(BaseModel):
    """
    This is a basic config object for the Streamlit Application.

    Field names mirror the top-level keys in app_config.yaml, so the parsed
    YAML dict can be splatted straight into this constructor.
    """
    LLMConfig: LLMConfig
    PromptHelper: PromptHelper
    SimpleNodeParser: SimpleNodeParser
    QueryEngineConfig: QueryEngineConfig
    ClusteringConfig: ClusteringConfig
    EscoSkillApiConfig: EscoSkillApiConfig
    SkillsToListConfig: SkillsToListConfig
65
+
66
def load_app_config() -> AppConfig:
    """
    Build the application configuration from app_config.yaml.

    The YAML file is expected to live next to this module; its top-level
    keys map 1:1 onto the AppConfig fields.

    :return: Initialized AppConfig object
    """
    config_path = Path(os.path.dirname(__file__)) / "app_config.yaml"
    raw_config = yaml.safe_load(config_path.read_text(encoding="utf-8"))
    return AppConfig(**raw_config)
79
+
80
def set_global_api_key():
    """
    This method sets the API key globally.

    Loads variables from a local .env file (if present) and copies
    OPENAI_API_KEY into the openai client module.

    :raises KeyError: if OPENAI_API_KEY is defined neither in the
        environment nor in the .env file
    """
    load_dotenv()

    try:
        openai.api_key = os.environ["OPENAI_API_KEY"]
    except KeyError:
        # Re-raise with an actionable message instead of the bare
        # KeyError('OPENAI_API_KEY') the original produced.
        raise KeyError(
            "OPENAI_API_KEY is not set; export it or add it to a .env file"
        ) from None
multi_index_demo/data/cv_comparison_pdf/Curriculum Vitae Anna Mustermann.pdf ADDED
Binary file (78.8 kB). View file
 
multi_index_demo/data/cv_comparison_pdf/Curriculum Vitae John Doe.pdf ADDED
Binary file (78.8 kB). View file
 
multi_index_demo/data/cv_comparison_pdf/Curriculum Vitae Max Mustermann.pdf ADDED
Binary file (79.2 kB). View file
 
multi_index_demo/esco_skill_graph/esco_skill_extractor.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain import PromptTemplate
2
+ from langchain.llms import OpenAI
3
+ import os
4
+ from config import set_global_api_key
5
+ from pathlib import Path
6
+ from typing import List
7
+ from paths import REPO_DIR_PATH
8
+
9
+ set_global_api_key()
10
+
11
+
12
class SkillsToList:
    """
    Query engine that turns plain text skill descriptions into lists of
    individual skills via a few-shot LLM prompt.
    """
    def __init__(self, model_name: str, temperature: float, prompt_template_path: str):
        """
        :param model_name: Name of the LLM to be used for prompting
        :param temperature: Temperature prompt parameter, high - explorative, low - conservative
        :param prompt_template_path: Filepath to the prompt template
        """
        # LLM client reused for every extraction call
        self.openai_engine = OpenAI(
            model_name=model_name,
            openai_api_key=os.environ["OPENAI_API_KEY"],
            temperature=temperature
        )

        # Read the few-shot prompt template for skill extraction from disk
        template_file = REPO_DIR_PATH / "esco_skill_graph" / prompt_template_path
        template_text = template_file.read_text(encoding="utf-8")

        # Template with a single {context} slot for the skill description
        self.prompt_template = PromptTemplate(
            input_variables=["context"],
            template=template_text
        )

    def extract_skill_list(self, skill_description: str) -> List[str]:
        """
        Extract a list of skills from one description.

        :param skill_description: A Descriptive text outlining Soft-skills & Hard-Skills

        :return: List of skills
        """
        prompt = self.prompt_template.format(context=skill_description)
        raw_answer = self.openai_engine(prompt)
        # The prompt asks for a semicolon-separated list; split and trim it
        return [part.strip() for part in raw_answer.strip().split(";")]

    def __call__(self, skill_descriptions: List[str]) -> List[List[str]]:
        """
        Extract skill lists for several descriptions at once.

        :param skill_descriptions: List of plain text skill descriptions

        :return: Lists of skills, one inner list per description
        """
        return [self.extract_skill_list(skill_description=description)
                for description in skill_descriptions]
multi_index_demo/esco_skill_graph/esco_skill_graph.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import networkx as nx
3
+ import matplotlib.pyplot as plt
4
+ import matplotlib
5
+ import streamlit as st
6
+ from typing import List, Dict, Union, Set
7
+ from matplotlib.pyplot import figure, text
8
+
9
class SkillGraph:
    """
    This is a Graph for visualizing the overlap of skills and competencies of different Candidates
    """
    def __init__(self, graph_type: str="spring"):
        """
        :param graph_type: Layout to use, either "spring" or "shell"
        """
        assert graph_type in ["spring", "shell"], "graph type must be 'spring' or 'shell'"
        self.graph_type = graph_type

    def get_edge_df(self, skill_lists: List[Dict], unique_skills: Set) -> pd.DataFrame:
        """
        This method returns a DataFrame with the Edges of the Graph

        :param skill_lists: One dict per candidate with keys "id" and "skills"
        :param unique_skills: Set of all skills that appear as graph nodes

        :return: DataFrame with "from" (candidate id) and "to" (skill) columns
        """
        relationship_list = []
        for skill_dict in skill_lists:
            candidate_id = skill_dict.get("id")
            # Only connect a candidate to skills that are part of the node set
            intersections = unique_skills.intersection(set(skill_dict.get("skills")))
            for skill in list(set(intersections)):
                relationship_list.append({"from": candidate_id, "to": skill})

        relationships = pd.DataFrame(relationship_list)
        return relationships

    def get_node_df(self, unique_skills: Set, skill_lists: List[Dict]) -> pd.DataFrame:
        """
        This method returns a DataFrame with the nodes of the Graph

        :param unique_skills: Set of all skills to emit as "skill" nodes
        :param skill_lists: One dict per candidate; its "id" becomes a "candidate" node

        :return: DataFrame with "ID" and "type" columns
        """
        candidate_nodes = [{"ID": skill_dict.get("id"), "type": "candidate"} for skill_dict in skill_lists]
        skill_nodes = [{"ID": skill, "type": "skill"} for skill in unique_skills]
        nodes = candidate_nodes + skill_nodes
        carac = pd.DataFrame(nodes)
        return carac

    def plot_skill_graph(self, skill_lists: List[Dict]):
        """
        This method plots the actual skill graph based on a list of provided skills per candidate

        :param skill_lists: Skills per candidate
        """
        unique_skills = set([skill for skills in skill_lists for skill in skills.get("skills")])

        relationships = self.get_edge_df(skill_lists=skill_lists, unique_skills=unique_skills)
        carac = self.get_node_df(skill_lists=skill_lists, unique_skills=unique_skills)

        # Set overall figure size
        fig, ax = plt.subplots()
        fig.tight_layout()

        # Create graph object
        G = nx.from_pandas_edgelist(relationships, 'from', 'to', create_using=nx.Graph())

        # Align node metadata with the graph's node order
        carac = carac.set_index('ID')
        carac = carac.reindex(G.nodes())

        # Make types into categories; codes are taken where needed below
        # (removed the original no-op statement `carac['type'].cat.codes`)
        carac['type'] = pd.Categorical(carac['type'])

        # Set node colors: candidates vs skills
        cmap = matplotlib.colors.ListedColormap(['dodgerblue', 'lightgray'])  # , 'darkorange'])

        # Set node sizes: candidates drawn larger than skills
        node_sizes = [1000 if entry == 'candidate' else 250 for entry in carac.type]

        if self.graph_type == "spring":

            pos = nx.spring_layout(G)
            # Create Layouts
            nx.draw(G, pos=pos, with_labels=False, node_color=carac['type'].cat.codes, cmap=cmap,
                    node_size=node_sizes, edgecolors='gray')

        elif self.graph_type == "shell":
            pos = nx.shell_layout(G)
            # BUG FIX: nx.draw_shell computes its own layout and forwards it
            # positionally, so also passing pos= raised "got multiple values
            # for argument 'pos'". Draw with the precomputed shell layout.
            nx.draw(G, pos=pos, with_labels=False, node_color=carac['type'].cat.codes, cmap=cmap,
                    node_size=node_sizes, edgecolors='gray')

        # Label every node at its layout position
        for node, (x, y) in pos.items():
            text(x, y, node, fontsize=8, ha='center', va='center')

        plt.title('European Skills, Competences, Qualifications and Occupations (ESCO) skill network', fontsize=14)

        st.pyplot(fig)
multi_index_demo/esco_skill_graph/esco_skill_mapping.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sentence_transformers import SentenceTransformer
3
+ import faiss
4
+ import numpy as np
5
+ from typing import List, Union
6
+ import logging
7
+ import os
8
+ from paths import REPO_DIR_PATH
9
+
10
+ logging.basicConfig(level=logging.DEBUG)
11
+
12
class EscoSkillApi:
    """
    The EscoSkillApi object.

    Maps free-text skill mentions onto the ESCO skill taxonomy using a
    SentenceTransformer embedding and a Faiss inner-product index.
    """
    def __init__(self,
                 sbert_model: SentenceTransformer,
                 index_name: str,
                 top_k: int):
        """
        :param sbert_model: Embedding model used for both indexing and querying
        :param index_name: Filename of the persisted Faiss index
        :param top_k: Number of nearest ESCO skills to return per query
        """
        self.esco_skills = self.load_esco_dataset(filepath=os.environ["ESCO_NER_SEARCHTERMS"])
        self.sbert_model = sbert_model
        self.index_name = index_name
        try:
            self.index = faiss.read_index((REPO_DIR_PATH / "esco_skill_graph" / self.index_name).as_posix())
        except Exception:
            # BUG FIX: narrowed from a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit. If the index cannot be read, build it.
            self.create_index()
        self.top_k = top_k

    def load_esco_dataset(self, filepath: str) -> List[str]:
        """
        This method loads a Dataset with the European Skills and Competencies

        :param filepath: Filepath to the ESCO dataset (CSV with a "skill" column)

        :return: Sorted, de-duplicated list of ESCO skills
        """
        skill_search_df = pd.read_csv(filepath)
        esco_skills = list(sorted(set(skill_search_df.skill.astype(str).tolist())))

        return esco_skills

    def create_index(self):
        """
        Embed all ESCO skills and build a fresh Faiss inner-product index,
        persisting it to disk.
        """
        encoded_data = self.sbert_model.encode(self.esco_skills)
        self.index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
        # Ids are list positions, so search hits map back into self.esco_skills
        self.index.add_with_ids(encoded_data, np.array(range(0, len(self.esco_skills))))
        # NOTE(review): writes to the CWD while __init__ reads from
        # REPO_DIR_PATH / "esco_skill_graph" — confirm these resolve to the same file.
        faiss.write_index(self.index, 'esco_skill_index')

    def run_query(self, query: str) -> Union[List, str]:
        """
        Look up the nearest ESCO skills for one query string.

        :param query: Free-text skill mention

        :return: The single best skill (str) when top_k == 1, else a list of skills
        """
        query_vector = self.sbert_model.encode([query])
        search_result = self.index.search(query_vector, self.top_k)

        # search returns (distances, ids); ids index into self.esco_skills
        results = [self.esco_skills[_id] for _id in search_result[1].tolist()[0]]

        # BUG FIX: the original compared the search-result tuple itself to 1
        # (`if top_k == 1`), which is never true; compare the configured
        # self.top_k so a single-hit query returns the bare string as declared.
        if self.top_k == 1:
            return results[0]
        else:
            return results

    def run_queries(self, queries: List[str]):
        """
        Return the single best ESCO skill for each query.

        :param queries: List of free-text skill mentions

        :return: List with one best-matching skill per query
        """
        query_results = []
        for query in queries:
            res = self.run_query(query=query)
            # res is a str when top_k == 1, otherwise a list — keep the best hit
            query_results.append(res if isinstance(res, str) else res[0])
        return query_results
multi_index_demo/esco_skill_graph/skill_list_prompt.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Act as if you are an HR recruiter reading the CV of a candidate and extracting a list
2
+ of skills from a Curriculum Vitae.
3
+
4
+ Provide a semicolon separated list of skills based on a provided CV.
5
+
6
+ Examples:
7
+
8
+ Context: The hard-skills of Data Scientist 1 are: python, R, Databricks, Docker, Tableau, PowerBI, AWS, Azure, Kubernetes
9
+
10
+ Skills: python; R; Databricks; Docker; Tableau; PowerBI; AWS; Azure; Kubernetes
11
+
12
+ Context: Hard-skills of Data Scientist 2 include: Programming Languages (Python, C#, Java, JavaScript, HTML/CSS), Problem-solving, Teamwork, Communication, Analytical Thinking.
13
+
14
+ Skills: Python; C#; Java; JavaScript; HTML/CSS; Problem-Solving; Teamwork; Communication; Analytical Thinking
15
+
16
+ Context: {context}
17
+
18
+ Skills:
multi_index_demo/indexing_utils.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_index import ServiceContext, PromptHelper
2
+ from llama_index.llms import OpenAI
3
+ from llama_index.embeddings import OpenAIEmbedding
4
+ from llama_index.node_parser import SimpleNodeParser
5
+ from config import AppConfig
6
+ import streamlit as st
7
+ from llama_index import ServiceContext, Document, PromptHelper
8
+ from llama_index.llms import OpenAI
9
+ from llama_index.embeddings import OpenAIEmbedding
10
+ from llama_index.tools import QueryEngineTool, ToolMetadata
11
+ from llama_index.query_engine import SubQuestionQueryEngine
12
+ from llama_index import GPTVectorStoreIndex
13
+ from llama_index.node_parser import SimpleNodeParser
14
+ from typing import List, Dict
15
+
16
+
17
class ServiceContextLoader:
    """Builds a fully configured llama-index ServiceContext from an AppConfig."""

    def __init__(self, app_config: AppConfig):
        self.app_config = app_config

    def load(self) -> ServiceContext:
        """
        Assemble the LLM wrapper, the embedding model, the node parser and
        the prompt helper into a ServiceContext.

        :return: Initialized ServiceContext for the Streamlit application
        """
        cfg = self.app_config

        # LLM API wrapper
        llm = OpenAI(
            temperature=cfg.LLMConfig.temperature,
            model=cfg.LLMConfig.model,
            max_tokens=cfg.LLMConfig.max_tokens,
        )

        # Parser that splits Documents into Nodes
        node_parser = SimpleNodeParser.from_defaults(
            chunk_size=cfg.SimpleNodeParser.chunk_size,
            chunk_overlap=cfg.SimpleNodeParser.chunk_overlap,
        )

        # Prompt sizing parameters
        prompt_helper = PromptHelper(
            context_window=cfg.PromptHelper.context_window,
            chunk_overlap_ratio=cfg.PromptHelper.chunk_overlap_ratio,
            chunk_size_limit=cfg.PromptHelper.chunk_size_limit,
        )

        # Bundle everything the query engine needs into one ServiceContext
        return ServiceContext.from_defaults(
            llm=llm,
            embed_model=OpenAIEmbedding(),
            node_parser=node_parser,
            prompt_helper=prompt_helper,
        )
58
+
59
@st.cache_resource
def create_multi_index(file_content_list: List[Dict], _service_context: ServiceContext,
                       top_k: int = 3) -> SubQuestionQueryEngine:
    """
    Create a SubQuestionQueryEngine multi-index from per-pdf-file indices.

    :param file_content_list: One dict per pdf file with keys 'text' and 'title'
    :param _service_context: ServiceContext for the query engines (the leading
                             underscore keeps it out of the Streamlit cache key)
    :param top_k: Number of most-similar chunks retrieved per sub-index

    :return: Multi-index query engine
    """
    # The original built "dynamically named" local variables via exec() on
    # f-strings derived from uploaded file names. That is both fragile
    # (exec-assigned names are not real function locals in Python 3) and an
    # injection risk; plain dictionary entries are fully equivalent.
    file_2_index = {}
    file_2_engine = {}
    for file_content in file_content_list:
        documents = [Document(text=file_content.get('text'))]
        title = file_content.get('title')

        index = GPTVectorStoreIndex.from_documents(documents)
        engine = index.as_query_engine(service_context=_service_context,
                                       similarity_top_k=top_k)

        # Store each index and query engine keyed by the pdf title
        file_2_index[title] = index
        file_2_engine[title] = engine

    # Wrap every per-pdf query engine as a QueryEngineTool
    query_engine_tools = [
        QueryEngineTool(
            query_engine=engine,
            metadata=ToolMetadata(
                name=title.replace(" ", "_"),
                description=title,
            ),
        )
        for title, engine in file_2_engine.items()
    ]

    # Combine all QueryEngineTools into one multi-index query engine
    s_engine = SubQuestionQueryEngine.from_defaults(query_engine_tools=query_engine_tools)

    return s_engine
multi_index_demo/paths.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ REPO_DIR_PATH = Path(__file__).parent
multi_index_demo/query_executers.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_index.query_engine import SubQuestionQueryEngine
2
+ import streamlit as st
3
+ import pandas as pd
4
+
5
+ from sentence_transformers import SentenceTransformer
6
+
7
+ from streamlit_utils import icon
8
+ from esco_skill_graph.esco_skill_extractor import SkillsToList
9
+ from esco_skill_graph.esco_skill_mapping import EscoSkillApi
10
+ from esco_skill_graph.esco_skill_graph import SkillGraph
11
+ from config import AppConfig
12
+ from response_clustering import ResponseClustering
13
+ from llama_index.response.schema import Response
14
+
15
+
16
class QueryExecuter:
    """
    Executes free-text queries against the multi-index query engine,
    shows the raw results, clusters the per-CV responses and — for
    skill-related queries — renders a skill graph.
    """
    def __init__(self, query_engine: SubQuestionQueryEngine,
                 sbert_model: SentenceTransformer,
                 config: AppConfig):
        """
        :param query_engine: Multi-index SubQuestionQueryEngine to query
        :param sbert_model: SBERT model used to embed the responses
        :param config: Application configuration
        """
        self.query_engine = query_engine
        self.sbert_model = sbert_model
        self.config = config

    def run(self):
        """
        Read a query from the Streamlit text input, execute it against the
        multi-index and visualize the raw results, the response clusters
        and — when the query mentions skills — the skill graph.
        """
        query_text = st.text_input("", "Search...")
        icon("search")
        button_clicked = st.button("OK")

        if button_clicked:
            response = self.query_engine.query(str(query_text))
            st.title("Raw Search results: ")
            st.write(f"**Query: {query_text}**")

            response_df = self.response_nodes_2_df(response=response)

            st.markdown("""**Raw Response**""")
            st.write(f"""{response.response}""")
            st.write(response_df)

            # Cluster the per-CV responses (plots are emitted as a side effect)
            resp_clustering = ResponseClustering(sbert_model=self.sbert_model)
            resp_clustering.compute_response_clusters(response_df=response_df)

            if "skill" in query_text.lower():
                st.title("Network analysis of skills")
                self.visualize_skill_graph(response_df=response_df)

    def response_nodes_2_df(self, response: Response) -> pd.DataFrame:
        """
        Convert the source nodes of a Response into a DataFrame with one
        row per sub-query / sub-response pair.

        :param response: Response object to be formatted as a pandas DataFrame

        :return: DataFrame with columns "id", "response" and "subquery"
        """
        data_list = []
        # Use the public attribute instead of poking into response.__dict__
        for ind, node in enumerate(response.source_nodes):
            split_node_text = node.node.text.split("\nResponse: \n")
            subquery = split_node_text[0]
            sub_response = split_node_text[-1]
            data_list.append({"id": f"Data Scientist {ind + 1}",
                              "response": sub_response,
                              "subquery": subquery})

        return pd.DataFrame(data_list)

    def visualize_skill_graph(self, response_df: pd.DataFrame):
        """
        Extract skill lists from the responses, normalize them against the
        ESCO taxonomy and plot the resulting Skill Graph.

        :param response_df: DataFrame containing the Query Responses per PDF index
        """
        skills_2_list = SkillsToList(model_name=self.config.SkillsToListConfig.llm_model_name,
                                     temperature=self.config.SkillsToListConfig.temperature,
                                     prompt_template_path=self.config.SkillsToListConfig.prompt_template)

        skill_lists = skills_2_list(skill_descriptions=response_df.response.tolist())

        esco_api = EscoSkillApi(sbert_model=self.sbert_model,
                                index_name=self.config.EscoSkillApiConfig.index_name,
                                top_k=self.config.EscoSkillApiConfig.top_k)

        # Map every extracted skill to its closest ESCO skill
        skill_lists_2_graph = []
        for ind, skill_list in enumerate(skill_lists):
            normalized_skills = esco_api.run_queries(queries=skill_list)
            skill_lists_2_graph.append({
                "id": f"Data Scientist {ind + 1}",
                "skills": normalized_skills
            })

        skill_graph = SkillGraph(graph_type="spring")
        skill_graph.plot_skill_graph(skill_lists=skill_lists_2_graph)
multi_index_demo/response_clustering.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scipy.cluster.hierarchy import linkage, dendrogram
2
+ import streamlit as st
3
+ import matplotlib.pyplot as plt
4
+ from sentence_transformers import SentenceTransformer
5
+ import numpy as np
6
+ from sklearn.cluster import AgglomerativeClustering
7
+ from dataclasses import dataclass
8
+ from sklearn.metrics.pairwise import cosine_similarity
9
+ import pandas as pd
10
+ import seaborn as sns
11
+
12
@dataclass
class ClusterResult:
    """Container for the outputs of ResponseClustering.compute_response_clusters."""
    # Pairwise cosine-similarity matrix of the response embeddings
    similarity_df: pd.DataFrame
    # Input response DataFrame with an added "cluster_labels" column, sorted by label
    cluster_df: pd.DataFrame
16
+
17
class ResponseClustering:
    """
    This class performs clustering of the Query Responses: it embeds the
    responses with SBERT, plots a dendrogram and a cosine-similarity
    heatmap, and labels each response via agglomerative clustering.
    """
    def __init__(self, sbert_model: SentenceTransformer):
        # SBERT model used to embed the response texts
        self.sbert_model = sbert_model

    def compute_response_clusters(self, response_df: pd.DataFrame) -> ClusterResult:
        """
        This method encodes the responses via SBERT, plots a clustering
        dendrogram and a cosine-similarity heatmap, and assigns an
        agglomerative cluster label to every response.

        NOTE(review): this mutates the caller's DataFrame in place by
        adding a "cluster_labels" column.

        :param response_df: DataFrame with a "response" text column and an "id" column

        :return: ClusterResult object with the cosine similarity DataFrame and the Cluster Result DataFrame
        """
        embeddings = self.sbert_model.encode(response_df.response.tolist())

        self.plot_cluster_dendrogram(encodings=embeddings)

        # Compute the cosine similarity of the embeddings and plot the heatmap
        similarity_mat = cosine_similarity(embeddings)
        cosine_sim_df = pd.DataFrame(similarity_mat, columns=response_df.id, index=response_df.id)
        self.plot_heatmap(similarity_df=cosine_sim_df)

        # Perform agglomerative clustering (scikit-learn defaults) and sort rows by cluster
        clustering = AgglomerativeClustering().fit(embeddings)
        response_df["cluster_labels"] = clustering.labels_
        response_df = response_df.sort_values("cluster_labels")
        cluster_result = ClusterResult(similarity_df=cosine_sim_df, cluster_df=response_df)

        return cluster_result

    def plot_cluster_dendrogram(self, encodings: np.ndarray):
        """
        Plot a Ward-linkage clustering dendrogram of the response embeddings
        into the Streamlit page.

        :param encodings: 2-D array of SBERT embeddings, one row per response
        """
        # Calculate the linkage: mergings
        mergings = linkage(encodings, method='ward')

        fig, ax = plt.subplots()
        fig.tight_layout()

        # Leaves are labelled positionally as "skills Data Scientist i"
        dendrogram(mergings,
                   labels=[f"skills Data Scientist {i + 1}" for i in range(encodings.shape[0])],
                   leaf_rotation=90,
                   leaf_font_size=6,
                   )

        plt.title("Clustering Dendrogram of the Data Scientist query-response embeddings")
        plt.xticks(rotation = 0)
        # plt.show()
        st.pyplot(fig)

    def plot_heatmap(self, similarity_df: pd.DataFrame):
        """
        Plot the cosine-similarity heatmap into the Streamlit page.

        :param similarity_df: Square DataFrame of pairwise cosine similarities
        """
        fig1, ax1 = plt.subplots()
        fig1.tight_layout()
        plt.title("Cosine-similarity heatmap of the Data Scientist query-response embeddings")
        sns.heatmap(similarity_df, cmap="viridis")
        st.pyplot(fig1)
multi_index_demo/streamlit_utils.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import fitz
3
+ from pathlib import Path
4
+ import re
5
+ from typing import List, Dict
6
+
7
def local_css(file_name):
    """Inject the contents of a local CSS file into the Streamlit page."""
    stylesheet = Path(file_name).read_text()
    st.markdown(f"<style>{stylesheet}</style>", unsafe_allow_html=True)
10
+
11
def remote_css(url):
    """Link an external stylesheet into the Streamlit page."""
    link_tag = f'<link href="{url}" rel="stylesheet">'
    st.markdown(link_tag, unsafe_allow_html=True)
13
+
14
def icon(icon_name):
    """Render a Material Icons glyph inline on the Streamlit page."""
    markup = f'<i class="material-icons">{icon_name}</i>'
    st.markdown(markup, unsafe_allow_html=True)
16
+
17
@st.cache_data
def load_pdf_files(files: List) -> List[Dict]:
    """
    Load and cache the text content of uploaded PDF files.

    :param files: Uploaded file-like objects (e.g. from st.file_uploader),
                  each exposing .read() and .name

    :return: One dict per pdf with keys 'engine_name', 'index_name',
             'text' and 'title'
    """
    content_list = []
    # Iterating an empty list is a no-op, so no explicit len() guard is needed
    for ind, file in enumerate(files):
        # Concatenate the plain text of every page
        with fitz.open(stream=file.read(), filetype="pdf") as doc:
            page_texts = [page.get_text() for page in doc]
        full_text = "\n".join(page_texts)

        cv_id = ind + 1
        # Strip only a trailing ".pdf". The original re.sub("\.pdf", ...)
        # used an invalid escape in a non-raw string (DeprecationWarning)
        # and removed the pattern anywhere in the name.
        title = file.name.removesuffix(".pdf")
        title = f"{title} Data Scientist {cv_id}"
        engine_name = title.replace(" ", "_")

        content_list.append({
            "engine_name": f"{engine_name}_query_engine",
            "index_name": f"{engine_name}_index",
            "text": full_text,
            "title": title
        })

    return content_list
multi_index_demo/style.css ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Global page colors: white text on Streamlit blue */
body {
    color: #fff;
    background-color: #4F8BF9;
}

/* Round, icon-sized buttons */
.stButton>button {
    color: #4F8BF9;
    border-radius: 50%;
    height: 3em;
    width: 3em;
}

/* Text color of the search input box */
.stTextInput>div>div>input {
    color: #4F8BF9;
}
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "multi-index-demo"
3
+ version = "0.1.2"
4
+ description = "This repository can be used as a demo for multi-index question answering"
5
+ authors = ["christoph_hiemenz <chiemenz@positivethinking.tech>"]
6
+ readme = "README.md"
7
+ packages = [{include = "multi_index_demo"}]
8
+
9
+ [tool.poetry.dependencies]
10
+ python = ">=3.10,<3.13"
11
+ streamlit = "^1.26.0"
12
+ llama-index = "^0.8.11.post3"
13
+ sentence-transformers = "^2.2.2"
14
+ networkx = "^3.1"
15
+ scikit-learn = "^1.3.0"
16
+ scipy = "^1.11.2"
17
+ pandas = "^2.0.3"
18
+ numpy = "^1.25.2"
19
+ matplotlib = "^3.7.2"
20
+ pymupdf = "^1.23.1"
21
+ faiss-cpu = "^1.7.4"
22
+ seaborn = "^0.12.2"
23
+ python-dotenv = "^1.0.0"
24
+
25
+ [tool.pytest.ini_options]
26
+ minversion = "6.0"
27
+ addopts = "-ra -q"
28
+ testpaths = [
29
+ "tests"
30
+ ]
31
+
32
+ [build-system]
33
+ requires = ["poetry-core"]
34
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit==1.26.0
2
+ llama-index==0.8.11.post3
3
+ sentence-transformers==2.2.2
4
+ networkx==3.1
5
+ scikit-learn==1.3.0
6
+ scipy==1.11.2
7
+ pandas==2.0.3
8
+ numpy==1.25.2
9
+ matplotlib==3.7.2
10
+ pymupdf==1.23.1
11
+ faiss-cpu==1.7.4
12
+ seaborn==0.12.2
13
+ python-dotenv==1.0.0
tests/__init__.py ADDED
File without changes
tests/test_version.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from multi_index_demo import __version__
2
+
3
def test_version():
    """Smoke-test that the package version matches the packaging metadata."""
    # NOTE(review): pyproject.toml declares version = "0.1.2" but this test
    # pinned "0.1.1" — the assertion must track the pyproject version.
    # Confirm multi_index_demo/__init__.py was bumped accordingly.
    assert __version__ == "0.1.2"