Upload 27 files
Browse files- .gitattributes +2 -0
- .gitignore +119 -0
- LICENSE +201 -0
- assets/indexing_stage_positive_thinking_company.png +0 -0
- assets/multi_index_queries_positive_thinking_company.png +3 -0
- assets/rag_overview_positive_thinking_company.png +3 -0
- multi_index_demo/__init__.py +1 -0
- multi_index_demo/app.py +61 -0
- multi_index_demo/app_config.yaml +29 -0
- multi_index_demo/config.py +86 -0
- multi_index_demo/data/cv_comparison_pdf/Curriculum Vitae Anna Mustermann.pdf +0 -0
- multi_index_demo/data/cv_comparison_pdf/Curriculum Vitae John Doe.pdf +0 -0
- multi_index_demo/data/cv_comparison_pdf/Curriculum Vitae Max Mustermann.pdf +0 -0
- multi_index_demo/esco_skill_graph/esco_skill_extractor.py +69 -0
- multi_index_demo/esco_skill_graph/esco_skill_graph.py +89 -0
- multi_index_demo/esco_skill_graph/esco_skill_mapping.py +67 -0
- multi_index_demo/esco_skill_graph/skill_list_prompt.txt +18 -0
- multi_index_demo/indexing_utils.py +107 -0
- multi_index_demo/paths.py +3 -0
- multi_index_demo/query_executers.py +98 -0
- multi_index_demo/response_clustering.py +74 -0
- multi_index_demo/streamlit_utils.py +51 -0
- multi_index_demo/style.css +15 -0
- poetry.lock +0 -0
- pyproject.toml +34 -0
- requirements.txt +13 -0
- tests/__init__.py +0 -0
- tests/test_version.py +5 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
assets/multi_index_queries_positive_thinking_company.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
assets/rag_overview_positive_thinking_company.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Data
|
| 10 |
+
*.json
|
| 11 |
+
*.jsonl
|
| 12 |
+
*.pickle
|
| 13 |
+
*.xlsx
|
| 14 |
+
|
| 15 |
+
# Distribution / packaging
|
| 16 |
+
.Python
|
| 17 |
+
build/
|
| 18 |
+
develop-eggs/
|
| 19 |
+
dist/
|
| 20 |
+
downloads/
|
| 21 |
+
eggs/
|
| 22 |
+
.eggs/
|
| 23 |
+
lib/
|
| 24 |
+
lib64/
|
| 25 |
+
parts/
|
| 26 |
+
sdist/
|
| 27 |
+
var/
|
| 28 |
+
wheels/
|
| 29 |
+
*.egg-info/
|
| 30 |
+
.installed.cfg
|
| 31 |
+
*.egg
|
| 32 |
+
MANIFEST
|
| 33 |
+
|
| 34 |
+
# PyInstaller
|
| 35 |
+
# Usually these files are written by a python script from a template
|
| 36 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 37 |
+
*.manifest
|
| 38 |
+
*.spec
|
| 39 |
+
|
| 40 |
+
# Installer logs
|
| 41 |
+
pip-log.txt
|
| 42 |
+
pip-delete-this-directory.txt
|
| 43 |
+
|
| 44 |
+
# Unit test / coverage reports
|
| 45 |
+
htmlcov/
|
| 46 |
+
.tox/
|
| 47 |
+
.coverage
|
| 48 |
+
.coverage.*
|
| 49 |
+
.cache
|
| 50 |
+
nosetests.xml
|
| 51 |
+
coverage.xml
|
| 52 |
+
*.cover
|
| 53 |
+
.hypothesis/
|
| 54 |
+
.pytest_cache/
|
| 55 |
+
|
| 56 |
+
# Translations
|
| 57 |
+
*.mo
|
| 58 |
+
*.pot
|
| 59 |
+
|
| 60 |
+
# Django stuff:
|
| 61 |
+
#*.log
|
| 62 |
+
local_settings.py
|
| 63 |
+
db.sqlite3
|
| 64 |
+
|
| 65 |
+
# Flask stuff:
|
| 66 |
+
instance/
|
| 67 |
+
.webassets-cache
|
| 68 |
+
|
| 69 |
+
# Scrapy stuff:
|
| 70 |
+
.scrapy
|
| 71 |
+
|
| 72 |
+
# Sphinx documentation
|
| 73 |
+
docs/_build/
|
| 74 |
+
|
| 75 |
+
# PyBuilder
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# pyenv
|
| 82 |
+
.python-version
|
| 83 |
+
|
| 84 |
+
# celery beat schedule file
|
| 85 |
+
celerybeat-schedule
|
| 86 |
+
|
| 87 |
+
# SageMath parsed files
|
| 88 |
+
*.sage.py
|
| 89 |
+
|
| 90 |
+
# Environments
|
| 91 |
+
.env
|
| 92 |
+
.venv
|
| 93 |
+
env/
|
| 94 |
+
venv/
|
| 95 |
+
ENV/
|
| 96 |
+
env.bak/
|
| 97 |
+
venv.bak/
|
| 98 |
+
|
| 99 |
+
# Spyder project settings
|
| 100 |
+
.spyderproject
|
| 101 |
+
.spyproject
|
| 102 |
+
|
| 103 |
+
# Rope project settings
|
| 104 |
+
.ropeproject
|
| 105 |
+
|
| 106 |
+
# mkdocs documentation
|
| 107 |
+
/site
|
| 108 |
+
|
| 109 |
+
# mypy
|
| 110 |
+
.mypy_cache/
|
| 111 |
+
|
| 112 |
+
# idea
|
| 113 |
+
.idea/*
|
| 114 |
+
.idea
|
| 115 |
+
idea/
|
| 116 |
+
|
| 117 |
+
# Exclude model.bin files from repo
|
| 118 |
+
*.bin
|
| 119 |
+
*vocab.txt
|
LICENSE
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Apache License
|
| 2 |
+
Version 2.0, January 2004
|
| 3 |
+
http://www.apache.org/licenses/
|
| 4 |
+
|
| 5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 6 |
+
|
| 7 |
+
1. Definitions.
|
| 8 |
+
|
| 9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
| 10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
| 11 |
+
|
| 12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
| 13 |
+
the copyright owner that is granting the License.
|
| 14 |
+
|
| 15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
| 16 |
+
other entities that control, are controlled by, or are under common
|
| 17 |
+
control with that entity. For the purposes of this definition,
|
| 18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
| 19 |
+
direction or management of such entity, whether by contract or
|
| 20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 22 |
+
|
| 23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
| 24 |
+
exercising permissions granted by this License.
|
| 25 |
+
|
| 26 |
+
"Source" form shall mean the preferred form for making modifications,
|
| 27 |
+
including but not limited to software source code, documentation
|
| 28 |
+
source, and configuration files.
|
| 29 |
+
|
| 30 |
+
"Object" form shall mean any form resulting from mechanical
|
| 31 |
+
transformation or translation of a Source form, including but
|
| 32 |
+
not limited to compiled object code, generated documentation,
|
| 33 |
+
and conversions to other media types.
|
| 34 |
+
|
| 35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
| 36 |
+
Object form, made available under the License, as indicated by a
|
| 37 |
+
copyright notice that is included in or attached to the work
|
| 38 |
+
(an example is provided in the Appendix below).
|
| 39 |
+
|
| 40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
| 41 |
+
form, that is based on (or derived from) the Work and for which the
|
| 42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
| 43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
| 44 |
+
of this License, Derivative Works shall not include works that remain
|
| 45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
| 46 |
+
the Work and Derivative Works thereof.
|
| 47 |
+
|
| 48 |
+
"Contribution" shall mean any work of authorship, including
|
| 49 |
+
the original version of the Work and any modifications or additions
|
| 50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
| 51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
| 53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
| 54 |
+
means any form of electronic, verbal, or written communication sent
|
| 55 |
+
to the Licensor or its representatives, including but not limited to
|
| 56 |
+
communication on electronic mailing lists, source code control systems,
|
| 57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
| 58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
| 59 |
+
excluding communication that is conspicuously marked or otherwise
|
| 60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
| 61 |
+
|
| 62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
| 64 |
+
subsequently incorporated within the Work.
|
| 65 |
+
|
| 66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
| 70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
| 71 |
+
Work and such Derivative Works in Source or Object form.
|
| 72 |
+
|
| 73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
| 74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 76 |
+
(except as stated in this section) patent license to make, have made,
|
| 77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 78 |
+
where such license applies only to those patent claims licensable
|
| 79 |
+
by such Contributor that are necessarily infringed by their
|
| 80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
| 81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
| 82 |
+
institute patent litigation against any entity (including a
|
| 83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 84 |
+
or a Contribution incorporated within the Work constitutes direct
|
| 85 |
+
or contributory patent infringement, then any patent licenses
|
| 86 |
+
granted to You under this License for that Work shall terminate
|
| 87 |
+
as of the date such litigation is filed.
|
| 88 |
+
|
| 89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
| 90 |
+
Work or Derivative Works thereof in any medium, with or without
|
| 91 |
+
modifications, and in Source or Object form, provided that You
|
| 92 |
+
meet the following conditions:
|
| 93 |
+
|
| 94 |
+
(a) You must give any other recipients of the Work or
|
| 95 |
+
Derivative Works a copy of this License; and
|
| 96 |
+
|
| 97 |
+
(b) You must cause any modified files to carry prominent notices
|
| 98 |
+
stating that You changed the files; and
|
| 99 |
+
|
| 100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
| 101 |
+
that You distribute, all copyright, patent, trademark, and
|
| 102 |
+
attribution notices from the Source form of the Work,
|
| 103 |
+
excluding those notices that do not pertain to any part of
|
| 104 |
+
the Derivative Works; and
|
| 105 |
+
|
| 106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
| 107 |
+
distribution, then any Derivative Works that You distribute must
|
| 108 |
+
include a readable copy of the attribution notices contained
|
| 109 |
+
within such NOTICE file, excluding those notices that do not
|
| 110 |
+
pertain to any part of the Derivative Works, in at least one
|
| 111 |
+
of the following places: within a NOTICE text file distributed
|
| 112 |
+
as part of the Derivative Works; within the Source form or
|
| 113 |
+
documentation, if provided along with the Derivative Works; or,
|
| 114 |
+
within a display generated by the Derivative Works, if and
|
| 115 |
+
wherever such third-party notices normally appear. The contents
|
| 116 |
+
of the NOTICE file are for informational purposes only and
|
| 117 |
+
do not modify the License. You may add Your own attribution
|
| 118 |
+
notices within Derivative Works that You distribute, alongside
|
| 119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
| 120 |
+
that such additional attribution notices cannot be construed
|
| 121 |
+
as modifying the License.
|
| 122 |
+
|
| 123 |
+
You may add Your own copyright statement to Your modifications and
|
| 124 |
+
may provide additional or different license terms and conditions
|
| 125 |
+
for use, reproduction, or distribution of Your modifications, or
|
| 126 |
+
for any such Derivative Works as a whole, provided Your use,
|
| 127 |
+
reproduction, and distribution of the Work otherwise complies with
|
| 128 |
+
the conditions stated in this License.
|
| 129 |
+
|
| 130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
| 132 |
+
by You to the Licensor shall be under the terms and conditions of
|
| 133 |
+
this License, without any additional terms or conditions.
|
| 134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
| 135 |
+
the terms of any separate license agreement you may have executed
|
| 136 |
+
with Licensor regarding such Contributions.
|
| 137 |
+
|
| 138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
| 139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 140 |
+
except as required for reasonable and customary use in describing the
|
| 141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 142 |
+
|
| 143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 144 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 147 |
+
implied, including, without limitation, any warranties or conditions
|
| 148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 150 |
+
appropriateness of using or redistributing the Work and assume any
|
| 151 |
+
risks associated with Your exercise of permissions under this License.
|
| 152 |
+
|
| 153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 154 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 155 |
+
unless required by applicable law (such as deliberate and grossly
|
| 156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 157 |
+
liable to You for damages, including any direct, indirect, special,
|
| 158 |
+
incidental, or consequential damages of any character arising as a
|
| 159 |
+
result of this License or out of the use or inability to use the
|
| 160 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 161 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 162 |
+
other commercial damages or losses), even if such Contributor
|
| 163 |
+
has been advised of the possibility of such damages.
|
| 164 |
+
|
| 165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
| 166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 168 |
+
or other liability obligations and/or rights consistent with this
|
| 169 |
+
License. However, in accepting such obligations, You may act only
|
| 170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 171 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 172 |
+
defend, and hold each Contributor harmless for any liability
|
| 173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 174 |
+
of your accepting any such warranty or additional liability.
|
| 175 |
+
|
| 176 |
+
END OF TERMS AND CONDITIONS
|
| 177 |
+
|
| 178 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 179 |
+
|
| 180 |
+
To apply the Apache License to your work, attach the following
|
| 181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 182 |
+
replaced with your own identifying information. (Don't include
|
| 183 |
+
the brackets!) The text should be enclosed in the appropriate
|
| 184 |
+
comment syntax for the file format. We also recommend that a
|
| 185 |
+
file or class name and description of purpose be included on the
|
| 186 |
+
same "printed page" as the copyright notice for easier
|
| 187 |
+
identification within third-party archives.
|
| 188 |
+
|
| 189 |
+
Copyright [yyyy] [name of copyright owner]
|
| 190 |
+
|
| 191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 192 |
+
you may not use this file except in compliance with the License.
|
| 193 |
+
You may obtain a copy of the License at
|
| 194 |
+
|
| 195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 196 |
+
|
| 197 |
+
Unless required by applicable law or agreed to in writing, software
|
| 198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 200 |
+
See the License for the specific language governing permissions and
|
| 201 |
+
limitations under the License.
|
assets/indexing_stage_positive_thinking_company.png
ADDED
|
assets/multi_index_queries_positive_thinking_company.png
ADDED
|
Git LFS Details
|
assets/rag_overview_positive_thinking_company.png
ADDED
|
Git LFS Details
|
multi_index_demo/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
__version__ = "0.1.2"  # package version string
|
multi_index_demo/app.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
from streamlit_utils import local_css, remote_css, load_pdf_files
|
| 6 |
+
from config import load_app_config, set_global_api_key
|
| 7 |
+
from indexing_utils import ServiceContextLoader, create_multi_index
|
| 8 |
+
from query_executers import QueryExecuter
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def main():
    """
    Entry point of the Streamlit application.

    Sets the OpenAI API key, loads styling and configuration, lets the user
    upload PDF files, builds a multi-index query engine over their content,
    and runs the query/response loop.
    """
    # Set the OpenAI Api key as an environment variable
    set_global_api_key()

    dirname = Path(os.path.dirname(__file__))
    local_css((dirname / "style.css").as_posix())
    remote_css('https://fonts.googleapis.com/icon?family=Material+Icons')

    # Load a Configuration object for the application
    app_config = load_app_config()

    # Initialize a ServiceContext for the QueryEngine
    service_context = ServiceContextLoader(app_config=app_config).load()

    # Initialize a simple SentenceTransformer model for clustering the final responses
    sbert_model = SentenceTransformer(app_config.ClusteringConfig.SentenceTransformerModel)

    st.title("Parallel Multi-Document Question Answering")

    # Provide a file_uploader with drag and drop functionality
    multiple_files = st.file_uploader(
        "Drop multiple files:", accept_multiple_files=True
    )

    # BUG FIX: with accept_multiple_files=True, st.file_uploader returns a
    # (possibly empty) list, never None, so the original `is None` check was
    # dead code and "No upload" was never shown. Check for emptiness instead.
    if not multiple_files:
        st.text("No upload")
    else:
        # Keep only PDF uploads
        files = [file for file in multiple_files if str(file.name).endswith(".pdf")]

        # Load the pdf files based on the file objects
        file_content_list = load_pdf_files(files=files)

        if file_content_list:
            top_k = app_config.QueryEngineConfig.similarity_top_k

            # Create a multi-index query engine based on the pdf file content
            multi_index_query_engine = create_multi_index(file_content_list=file_content_list,
                                                          _service_context=service_context,
                                                          top_k=top_k)

            # Execute the query and display the results in the streamlit app
            query_executer = QueryExecuter(query_engine=multi_index_query_engine,
                                           sbert_model=sbert_model,
                                           config=app_config)
            query_executer.run()


if __name__ == "__main__":
    main()
|
multi_index_demo/app_config.yaml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configure the Large Language Model API calls
|
| 2 |
+
LLMConfig:
|
| 3 |
+
temperature: 0
|
| 4 |
+
model: "text-davinci-003"
|
| 5 |
+
max_tokens: 250
|
| 6 |
+
# Configure how the pdf file texts should be chunked into Node texts
|
| 7 |
+
SimpleNodeParser:
|
| 8 |
+
chunk_size: 1024
|
| 9 |
+
chunk_overlap: 20
|
| 10 |
+
# Specify how many contexts to retrieve in the retrieval step
|
| 11 |
+
QueryEngineConfig:
|
| 12 |
+
similarity_top_k: 3
|
| 13 |
+
# Configure the prompt for response synthesis
|
| 14 |
+
PromptHelper:
|
| 15 |
+
context_window: 4096
|
| 16 |
+
chunk_overlap_ratio: 0.1
|
| 17 |
+
chunk_size_limit: null
|
| 18 |
+
# Specify the SentenceTransformer model for semantic clustering of responses
|
| 19 |
+
ClusteringConfig:
|
| 20 |
+
SentenceTransformerModel: "all-mpnet-base-v2"
|
| 21 |
+
# Specify the Config for the Esco Faiss index skill query API
|
| 22 |
+
EscoSkillApiConfig:
|
| 23 |
+
index_name: esco_skill_index
|
| 24 |
+
top_k: 1
|
| 25 |
+
# Config for the API that extracts a semicolon-separated list of skills from a skill description text
|
| 26 |
+
SkillsToListConfig:
|
| 27 |
+
llm_model_name: "text-davinci-003"
|
| 28 |
+
temperature: 0.0
|
| 29 |
+
prompt_template: skill_list_prompt.txt
|
multi_index_demo/config.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import yaml
|
| 2 |
+
from pydantic import BaseModel
|
| 3 |
+
from typing import Optional
|
| 4 |
+
import os
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import openai
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
|
| 9 |
+
class LLMConfig(BaseModel):
    """
    Configuration of the LLM prompt parameters.

    Field names match the keys of the ``LLMConfig`` section in
    ``app_config.yaml``.
    """
    temperature: float  # LLM sampling temperature (0 in the shipped config)
    model: str          # name of the LLM model, e.g. "text-davinci-003"
    max_tokens: int     # maximum number of tokens the LLM may generate
|
| 16 |
+
|
| 17 |
+
class SimpleNodeParser(BaseModel):
    """
    Configuration of the simple Document-to-Node parser, i.e. how the PDF
    file texts are chunked into Node texts.

    Field names match the ``SimpleNodeParser`` section in ``app_config.yaml``.
    """
    chunk_size: int     # target size of one text chunk
    chunk_overlap: int  # overlap between consecutive chunks
|
| 23 |
+
|
| 24 |
+
class PromptHelper(BaseModel):
    """
    Prompt helper config constraining the prompt context size and the
    overlap between chunks during response synthesis.

    Field names match the ``PromptHelper`` section in ``app_config.yaml``.
    """
    context_window: int              # maximum prompt context window size
    chunk_overlap_ratio: float       # fractional overlap between chunks
    chunk_size_limit: Optional[int]  # optional hard chunk-size limit (YAML `null` -> None)
|
| 32 |
+
|
| 33 |
+
class QueryEngineConfig(BaseModel):
    """
    Configuration of the query engine.
    """
    similarity_top_k: int  # number of contexts to retrieve in the retrieval step
|
| 38 |
+
|
| 39 |
+
class ClusteringConfig(BaseModel):
    """
    Configuration of the SentenceTransformer embedding model used for
    cluster analysis of the final responses.
    """
    # Model name, e.g. "all-mpnet-base-v2". Kept in CamelCase so the field
    # name matches the key in app_config.yaml.
    SentenceTransformerModel: str
|
| 44 |
+
|
| 45 |
+
class EscoSkillApiConfig(BaseModel):
    """
    Configuration of the ESCO Faiss-index skill query API.
    """
    index_name: str  # name of the Faiss skill index to query
    top_k: int       # number of nearest skills to retrieve per query
|
| 48 |
+
|
| 49 |
+
class SkillsToListConfig(BaseModel):
    """
    Configuration for extracting a semicolon-separated list of skills from a
    plain-text skill description (used by esco_skill_graph/esco_skill_extractor.py).
    """
    llm_model_name: str   # name of the LLM used for the extraction prompt
    temperature: float    # LLM sampling temperature
    prompt_template: str  # prompt template filename, resolved relative to esco_skill_graph/
|
| 53 |
+
|
| 54 |
+
class AppConfig(BaseModel):
    """
    Top-level config object for the Streamlit application.

    Each field deliberately shadows the name of its model class so the field
    names line up 1:1 with the top-level keys of ``app_config.yaml``.
    """
    LLMConfig: LLMConfig
    PromptHelper: PromptHelper
    SimpleNodeParser: SimpleNodeParser
    QueryEngineConfig: QueryEngineConfig
    ClusteringConfig: ClusteringConfig
    EscoSkillApiConfig: EscoSkillApiConfig
    SkillsToListConfig: SkillsToListConfig
|
| 65 |
+
|
| 66 |
+
def load_app_config() -> AppConfig:
    """
    Load and validate the application configuration.

    Reads ``app_config.yaml`` from the directory this module lives in and
    parses it into an :class:`AppConfig` instance.

    :return: Initialized AppConfig object
    """
    config_path = Path(os.path.dirname(__file__)) / "app_config.yaml"
    with config_path.open("r", encoding="utf-8") as config_file:
        raw_config = yaml.safe_load(config_file)

    return AppConfig(**raw_config)
|
| 79 |
+
|
| 80 |
+
def set_global_api_key():
    """
    Set the OpenAI API key globally on the ``openai`` package.

    Loads variables from a local ``.env`` file (if present) and then reads
    ``OPENAI_API_KEY`` from the environment.

    :raises KeyError: if ``OPENAI_API_KEY`` is set neither in the
        environment nor in the ``.env`` file.
    """
    load_dotenv()

    api_key = os.environ.get("OPENAI_API_KEY")
    if api_key is None:
        # Fail early with an actionable message instead of the bare KeyError
        # the original direct os.environ["OPENAI_API_KEY"] lookup produced.
        raise KeyError(
            "OPENAI_API_KEY is not set. Export it or add it to a .env file."
        )

    openai.api_key = api_key
|
multi_index_demo/data/cv_comparison_pdf/Curriculum Vitae Anna Mustermann.pdf
ADDED
|
Binary file (78.8 kB). View file
|
|
|
multi_index_demo/data/cv_comparison_pdf/Curriculum Vitae John Doe.pdf
ADDED
|
Binary file (78.8 kB). View file
|
|
|
multi_index_demo/data/cv_comparison_pdf/Curriculum Vitae Max Mustermann.pdf
ADDED
|
Binary file (79.2 kB). View file
|
|
|
multi_index_demo/esco_skill_graph/esco_skill_extractor.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain import PromptTemplate
|
| 2 |
+
from langchain.llms import OpenAI
|
| 3 |
+
import os
|
| 4 |
+
from config import set_global_api_key
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from typing import List
|
| 7 |
+
from paths import REPO_DIR_PATH
|
| 8 |
+
|
| 9 |
+
set_global_api_key()
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class SkillsToList:
    """
    Query engine that extracts individual skills from plain-text skill
    descriptions via an LLM prompt.
    """

    def __init__(self, model_name: str, temperature: float, prompt_template_path: str):
        """
        :param model_name: Name of the LLM to be used for prompting
        :param temperature: Temperature prompt parameter, high - explorative, low - conservative
        :param prompt_template_path: Filepath to the prompt template, relative to esco_skill_graph/
        """
        # initialize the LLM Api
        self.openai_engine = OpenAI(
            model_name=model_name,
            openai_api_key=os.environ["OPENAI_API_KEY"],
            temperature=temperature
        )

        # Read a prompt template for skill extraction from a list of skills
        prompt_path = REPO_DIR_PATH / "esco_skill_graph" / prompt_template_path
        with prompt_path.open("r", encoding="utf-8") as f:
            template = f.read()

        # Create a prompt template; the template must contain a {context} placeholder
        self.prompt_template = PromptTemplate(
            input_variables=["context"],
            template=template
        )

    def extract_skill_list(self, skill_description: str) -> List[str]:
        """
        Extract a list of skills from one description.

        (BUG FIX: the original docstring opened with `\"\"\"\"`, leaving a stray
        quote character at the start of the docstring.)

        :param skill_description: A descriptive text outlining soft skills & hard skills

        :return: List of skills
        """
        prompt = self.prompt_template.format(context=skill_description)

        result = self.openai_engine(prompt)

        # BUG FIX: also drop empty entries (e.g. from a trailing ";" or an
        # empty completion) that previously produced "" pseudo-skills.
        skills = [skill.strip() for skill in result.strip().split(";") if skill.strip()]

        return skills

    def __call__(self, skill_descriptions: List[str]) -> List[List[str]]:
        """
        Extract one skill list per description.

        :param skill_descriptions: List of plain text skill descriptions

        :return: Lists of skills, one inner list per input description
        """
        return [
            self.extract_skill_list(skill_description=skill_desc)
            for skill_desc in skill_descriptions
        ]
|
multi_index_demo/esco_skill_graph/esco_skill_graph.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import networkx as nx
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
import matplotlib
|
| 5 |
+
import streamlit as st
|
| 6 |
+
from typing import List, Dict, Union, Set
|
| 7 |
+
from matplotlib.pyplot import figure, text
|
| 8 |
+
|
| 9 |
+
class SkillGraph:
    """
    Graph visualization of the overlap of skills and competencies of different candidates.

    Candidates and skills become nodes; an edge connects a candidate to every skill
    mentioned in their profile, so shared skills show up as shared neighbours.
    """

    def __init__(self, graph_type: str = "spring"):
        """
        :param graph_type: networkx layout to use, either 'spring' or 'shell'
        """
        assert graph_type in ["spring", "shell"], "graph type must be 'spring' or 'shell'"
        self.graph_type = graph_type

    def get_edge_df(self, skill_lists: List[Dict], unique_skills: Set) -> pd.DataFrame:
        """
        Build the edge list of the graph (candidate -> skill).

        :param skill_lists: One dict per candidate with the keys 'id' and 'skills'
        :param unique_skills: Set of all skills that should appear in the graph

        :return: DataFrame with the columns 'from' (candidate id) and 'to' (skill)
        """
        relationship_list = []
        for skill_dict in skill_lists:
            candidate_id = skill_dict.get("id")
            # Only connect the candidate to skills that are part of the plotted skill set
            intersections = unique_skills.intersection(set(skill_dict.get("skills")))
            for skill in intersections:
                relationship_list.append({"from": candidate_id, "to": skill})

        relationships = pd.DataFrame(relationship_list)
        return relationships

    def get_node_df(self, unique_skills: Set, skill_lists: List[Dict]) -> pd.DataFrame:
        """
        Build the node table of the graph.

        :param unique_skills: Set of all skills that should appear in the graph
        :param skill_lists: One dict per candidate with the keys 'id' and 'skills'

        :return: DataFrame with the columns 'ID' and 'type' ('candidate' or 'skill')
        """
        candidate_nodes = [{"ID": skill_dict.get("id"), "type": "candidate"} for skill_dict in skill_lists]
        skill_nodes = [{"ID": skill, "type": "skill"} for skill in unique_skills]
        nodes = candidate_nodes + skill_nodes
        carac = pd.DataFrame(nodes)
        return carac

    def plot_skill_graph(self, skill_lists: List[Dict]):
        """
        Plot the actual skill graph based on a list of provided skills per candidate.

        :param skill_lists: Skills per candidate (dicts with keys 'id' and 'skills')
        """
        unique_skills = set([skill for skills in skill_lists for skill in skills.get("skills")])

        relationships = self.get_edge_df(skill_lists=skill_lists, unique_skills=unique_skills)
        carac = self.get_node_df(skill_lists=skill_lists, unique_skills=unique_skills)

        # Set overall figure size
        fig, ax = plt.subplots()
        fig.tight_layout()

        # Create graph object
        G = nx.from_pandas_edgelist(relationships, 'from', 'to', create_using=nx.Graph())

        # Align the node attribute table with the graph's node order and make the
        # type a categorical so .cat.codes can be used as the color index below
        # (the original also had a bare no-op statement `carac['type'].cat.codes`,
        # removed here)
        carac = carac.set_index('ID')
        carac = carac.reindex(G.nodes())
        carac['type'] = pd.Categorical(carac['type'])

        # Candidates in dodgerblue, skills in lightgray (order of the categorical codes)
        cmap = matplotlib.colors.ListedColormap(['dodgerblue', 'lightgray'])

        # Draw candidate nodes larger than skill nodes
        node_sizes = [1000 if entry == 'candidate' else 250 for entry in carac.type]

        if self.graph_type == "spring":
            pos = nx.spring_layout(G)
            nx.draw(G, pos=pos, with_labels=False, node_color=carac['type'].cat.codes, cmap=cmap,
                    node_size=node_sizes, edgecolors='gray')

        elif self.graph_type == "shell":
            pos = nx.shell_layout(G)
            nx.draw_shell(G, pos=pos, with_labels=False, node_color=carac['type'].cat.codes, cmap=cmap,
                          node_size=node_sizes, edgecolors='gray')

        # Label every node at its layout position
        for node, (x, y) in pos.items():
            text(x, y, node, fontsize=8, ha='center', va='center')

        plt.title('European Skills, Competences, Qualifications and Occupations (ESCO) skill network', fontsize=14)

        st.pyplot(fig)
|
multi_index_demo/esco_skill_graph/esco_skill_mapping.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sentence_transformers import SentenceTransformer
|
| 3 |
+
import faiss
|
| 4 |
+
import numpy as np
|
| 5 |
+
from typing import List, Union
|
| 6 |
+
import logging
|
| 7 |
+
import os
|
| 8 |
+
from paths import REPO_DIR_PATH
|
| 9 |
+
|
| 10 |
+
logging.basicConfig(level=logging.DEBUG)
|
| 11 |
+
|
| 12 |
+
class EscoSkillApi:
    """
    Semantic search API over the ESCO (European Skills, Competences, Qualifications
    and Occupations) skill vocabulary.

    Skills are embedded with an SBERT model and stored in a FAISS inner-product
    index; free-text queries are mapped to their nearest ESCO skill names.
    """

    def __init__(self,
                 sbert_model: "SentenceTransformer",
                 index_name: str,
                 top_k: int):
        """
        :param sbert_model: SentenceTransformer used to embed skills and queries
        :param index_name: File name of the persisted FAISS index
        :param top_k: Number of nearest ESCO skills returned per query
        """
        # NOTE(review): requires the env var ESCO_NER_SEARCHTERMS to point to the ESCO csv
        self.esco_skills = self.load_esco_dataset(filepath=os.environ["ESCO_NER_SEARCHTERMS"])
        self.sbert_model = sbert_model
        self.index_name = index_name
        try:
            self.index = faiss.read_index(self._index_path())
        except Exception:
            # No (readable) persisted index yet -> build and persist it from scratch
            self.create_index()
        self.top_k = top_k

    def _index_path(self) -> str:
        """Return the posix path under which the FAISS index is persisted."""
        return (REPO_DIR_PATH / "esco_skill_graph" / self.index_name).as_posix()

    def load_esco_dataset(self, filepath: str) -> List[str]:
        """
        Load a dataset with the European Skills and Competencies.

        :param filepath: Filepath to the ESCO dataset csv (needs a 'skill' column)

        :return: Sorted, de-duplicated list of ESCO skill names
        """
        skill_search_df = pd.read_csv(filepath)
        esco_skills = list(sorted(set(skill_search_df.skill.astype(str).tolist())))

        return esco_skills

    def create_index(self):
        """
        Embed all ESCO skills, build a FAISS inner-product index over them and
        persist it under the configured index name.
        """
        encoded_data = self.sbert_model.encode(self.esco_skills)
        # 768 = embedding dimension of the SBERT model -- TODO confirm for other models
        self.index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
        self.index.add_with_ids(encoded_data, np.array(range(0, len(self.esco_skills))))
        # Bug fix: persist under the same path that __init__ tries to read from.
        # The original wrote to the hard-coded name 'esco_skill_index' in the cwd
        # and therefore never found the index again on the next start-up.
        faiss.write_index(self.index, self._index_path())

    def run_query(self, query: str) -> Union[List, str]:
        """
        Map a free-text skill query to its nearest ESCO skill name(s).

        :param query: Free-text skill description

        :return: The single best matching skill (str) if top_k == 1,
                 otherwise the list of the top_k matching skills
        """
        query_vector = self.sbert_model.encode([query])
        top_k = self.index.search(query_vector, self.top_k)

        results = [self.esco_skills[_id] for _id in top_k[1].tolist()[0]]

        # Bug fix: the original compared the faiss result tuple ('top_k') to 1,
        # which is never true; the intent was to check the configured self.top_k
        if self.top_k == 1:
            return results[0]
        else:
            return results

    def run_queries(self, queries: List[str]) -> List[str]:
        """
        Map several queries to their single best ESCO skill each.

        :param queries: Free-text skill descriptions

        :return: Best matching ESCO skill per query
        """
        query_results = []
        for query in queries:
            res = self.run_query(query=query)
            # run_query returns a plain string when top_k == 1; indexing that with
            # [0] would only return its first character
            query_results.append(res if isinstance(res, str) else res[0])
        return query_results
|
multi_index_demo/esco_skill_graph/skill_list_prompt.txt
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Act as if you are an HR recruiter reading the CV of a candidate and extracting a list
|
| 2 |
+
of skills from a Curriculum Vitae.
|
| 3 |
+
|
| 4 |
+
Provide a semicolon separated list of skills based on a provided CV.
|
| 5 |
+
|
| 6 |
+
Examples:
|
| 7 |
+
|
| 8 |
+
Context: The hard-skills of Data Scientist 1 are: python, R, Databricks, Docker, Tableau, PowerBI, AWS, Azure, Kubernetes
|
| 9 |
+
|
| 10 |
+
Skills: python; R; Databricks; Docker; Tableau; PowerBI; AWS; Azure; Kubernetes
|
| 11 |
+
|
| 12 |
+
Context: Hard-skills of Data Scientist 2 include: Programming Languages (Python, C#, Java, JavaScript, HTML/CSS), Problem-solving, Teamwork, Communication, Analytical Thinking.
|
| 13 |
+
|
| 14 |
+
Skills: Python; C#; Java; JavaScript; HTML/CSS; Problem-Solving; Teamwork; Communication; Analytical Thinking
|
| 15 |
+
|
| 16 |
+
Context: {context}
|
| 17 |
+
|
| 18 |
+
Skills:
|
multi_index_demo/indexing_utils.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from llama_index import ServiceContext, PromptHelper
|
| 2 |
+
from llama_index.llms import OpenAI
|
| 3 |
+
from llama_index.embeddings import OpenAIEmbedding
|
| 4 |
+
from llama_index.node_parser import SimpleNodeParser
|
| 5 |
+
from config import AppConfig
|
| 6 |
+
import streamlit as st
|
| 7 |
+
from llama_index import ServiceContext, Document, PromptHelper
|
| 8 |
+
from llama_index.llms import OpenAI
|
| 9 |
+
from llama_index.embeddings import OpenAIEmbedding
|
| 10 |
+
from llama_index.tools import QueryEngineTool, ToolMetadata
|
| 11 |
+
from llama_index.query_engine import SubQuestionQueryEngine
|
| 12 |
+
from llama_index import GPTVectorStoreIndex
|
| 13 |
+
from llama_index.node_parser import SimpleNodeParser
|
| 14 |
+
from typing import List, Dict
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class ServiceContextLoader:
    """
    Builds the llama-index ServiceContext used by the Streamlit application.
    """

    def __init__(self, app_config: AppConfig):
        """
        :param app_config: Parsed application configuration
        """
        self.app_config = app_config

    def load(self) -> ServiceContext:
        """
        Assemble the ServiceContext from the configured LLM, embedding model,
        node parser and prompt helper.

        :return: Initialized ServiceContext for the Streamlit application
        """
        cfg = self.app_config

        # LLM API wrapper
        llm = OpenAI(
            temperature=cfg.LLMConfig.temperature,
            model=cfg.LLMConfig.model,
            max_tokens=cfg.LLMConfig.max_tokens,
        )

        # Embedding model (OpenAI defaults)
        embed_model = OpenAIEmbedding()

        # Splits Documents into Nodes of the configured chunk size / overlap
        node_parser = SimpleNodeParser.from_defaults(
            chunk_size=cfg.SimpleNodeParser.chunk_size,
            chunk_overlap=cfg.SimpleNodeParser.chunk_overlap,
        )

        # Controls how much context fits into a single prompt
        prompt_helper = PromptHelper(
            context_window=cfg.PromptHelper.context_window,
            chunk_overlap_ratio=cfg.PromptHelper.chunk_overlap_ratio,
            chunk_size_limit=cfg.PromptHelper.chunk_size_limit,
        )

        # Bundle LLM, Embedding, NodeParser and PromptHelper for the query engine
        return ServiceContext.from_defaults(
            llm=llm,
            embed_model=embed_model,
            node_parser=node_parser,
            prompt_helper=prompt_helper,
        )
|
| 58 |
+
|
| 59 |
+
@st.cache_resource
def create_multi_index(file_content_list: List[Dict], _service_context: ServiceContext,
                       top_k: int=3) -> SubQuestionQueryEngine:
    """
    Create a SubQuestionQueryEngine Multi-Index based on the indices for individual pdf files.

    :param file_content_list: List with the content per pdf file; every entry needs
                              the keys 'text' and 'title' (the 'index_name' and
                              'engine_name' keys are accepted but no longer required)
    :param _service_context: ServiceContext (the leading underscore keeps streamlit
                             from trying to hash it for caching)
    :param top_k: Number of nodes retrieved per sub-query

    :return: Multi-index query engine
    """
    file_2_index = {}
    file_2_engine = {}
    for file_content in file_content_list:

        documents = [Document(text=file_content.get('text'))]
        title = file_content.get('title')

        # The original generated uniquely *named* Python variables for every index
        # and engine via exec(); plain dict entries keyed by title do the same job
        # without dynamic code execution.
        index = GPTVectorStoreIndex.from_documents(documents)
        engine = index.as_query_engine(service_context=_service_context,
                                       similarity_top_k=top_k)

        # Store each index and query engine in a dictionary
        file_2_index[title] = index
        file_2_engine[title] = engine

    # Define a List of QueryEngineTools wrapping all individual pdf file indices
    query_engine_tools = [
        QueryEngineTool(
            query_engine=engine,
            metadata=ToolMetadata(
                name=title.replace(" ", "_"),
                description=title,
            ),
        )
        for title, engine in file_2_engine.items()
    ]

    # Initialize a multi-index query engine based on all QueryEngineTools
    s_engine = SubQuestionQueryEngine.from_defaults(query_engine_tools=query_engine_tools)

    return s_engine
|
multi_index_demo/paths.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
REPO_DIR_PATH = Path(__file__).parent
|
multi_index_demo/query_executers.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from llama_index.query_engine import SubQuestionQueryEngine
|
| 2 |
+
import streamlit as st
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
from sentence_transformers import SentenceTransformer
|
| 6 |
+
|
| 7 |
+
from streamlit_utils import icon
|
| 8 |
+
from esco_skill_graph.esco_skill_extractor import SkillsToList
|
| 9 |
+
from esco_skill_graph.esco_skill_mapping import EscoSkillApi
|
| 10 |
+
from esco_skill_graph.esco_skill_graph import SkillGraph
|
| 11 |
+
from config import AppConfig
|
| 12 |
+
from response_clustering import ResponseClustering
|
| 13 |
+
from llama_index.response.schema import Response
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class QueryExecuter:
    """
    Executes queries against the multi-index and visualizes the results:
    the raw responses, a clustering of the responses and -- for skill
    related queries -- a candidate/skill network graph.
    """

    def __init__(self, query_engine: SubQuestionQueryEngine,
                 sbert_model: SentenceTransformer,
                 config: AppConfig):
        self.query_engine = query_engine
        self.sbert_model = sbert_model
        self.config = config

    def run(self):
        """
        Read a query from the text input and, once the OK button is pressed,
        run it against the multi-index and visualize the query results.
        """
        query_text = st.text_input("", "Search...")
        icon("search")
        if not st.button("OK"):
            return

        response = self.query_engine.query(str(query_text))
        st.title("Raw Search results: ")
        st.write(f"**Query: {query_text}**")

        response_df = self.response_nodes_2_df(response=response)

        st.markdown("""**Raw Response**""")
        st.write(f"""{response.response}""")
        st.write(response_df)

        resp_clustering = ResponseClustering(sbert_model=self.sbert_model)
        resp_clustering.compute_response_clusters(response_df=response_df)

        if "skill" in query_text.lower():
            st.title("Network analysis of skills")
            self.visualize_skill_graph(response_df=response_df)

    def response_nodes_2_df(self, response: Response) -> pd.DataFrame:
        """
        Flatten the source nodes of a Response into a DataFrame.

        :param response: Response object to be formatted as a pandas DataFrame

        :return: DataFrame with one row (id / response / subquery) per source node
        """
        rows = []
        for ind, node in enumerate(response.__dict__["source_nodes"]):
            parts = node.node.text.split("\nResponse: \n")
            rows.append({
                "id": f"Data Scientist {ind + 1}",
                "response": parts[-1],
                "subquery": parts[0],
            })

        return pd.DataFrame(rows)

    def visualize_skill_graph(self, response_df: pd.DataFrame):
        """
        Normalize the skills of each response against the ESCO vocabulary and
        plot them as a candidate/skill network.

        :param response_df: DataFrame containing the Query Responses per PDF index
        """
        cfg = self.config
        skills_2_list = SkillsToList(model_name=cfg.SkillsToListConfig.llm_model_name,
                                     temperature=cfg.SkillsToListConfig.temperature,
                                     prompt_template_path=cfg.SkillsToListConfig.prompt_template)

        skill_lists = skills_2_list(skill_descriptions=response_df.response.tolist())

        esco_api = EscoSkillApi(sbert_model=self.sbert_model,
                                index_name=cfg.EscoSkillApiConfig.index_name,
                                top_k=cfg.EscoSkillApiConfig.top_k)

        skill_lists_2_graph = [
            {
                "id": f"Data Scientist {ind + 1}",
                "skills": esco_api.run_queries(queries=skill_list),
            }
            for ind, skill_list in enumerate(skill_lists)
        ]

        skill_graph = SkillGraph(graph_type="spring")
        skill_graph.plot_skill_graph(skill_lists=skill_lists_2_graph)
|
multi_index_demo/response_clustering.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from scipy.cluster.hierarchy import linkage, dendrogram
|
| 2 |
+
import streamlit as st
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
| 5 |
+
import numpy as np
|
| 6 |
+
from sklearn.cluster import AgglomerativeClustering
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import seaborn as sns
|
| 11 |
+
|
| 12 |
+
@dataclass
|
| 13 |
+
class ClusterResult:
|
| 14 |
+
similarity_df: pd.DataFrame
|
| 15 |
+
cluster_df: pd.DataFrame
|
| 16 |
+
|
| 17 |
+
class ResponseClustering:
|
| 18 |
+
"""
|
| 19 |
+
This class performs clustering of the Query Responses
|
| 20 |
+
"""
|
| 21 |
+
def __init__(self, sbert_model: SentenceTransformer):
|
| 22 |
+
self.sbert_model = sbert_model
|
| 23 |
+
|
| 24 |
+
def compute_response_clusters(self, response_df: pd.DataFrame) -> ClusterResult:
|
| 25 |
+
"""
|
| 26 |
+
This method encodes the responses via SBERT, computes the cosine similarity of the
|
| 27 |
+
|
| 28 |
+
:param response_df: This is a DataFrame containing the query responses
|
| 29 |
+
|
| 30 |
+
:return: ClusterResult object with the cosine similarity DataFrame and the Cluster Result DataFrame
|
| 31 |
+
"""
|
| 32 |
+
embeddings = self.sbert_model.encode(response_df.response.tolist())
|
| 33 |
+
|
| 34 |
+
self.plot_cluster_dendrogram(encodings=embeddings)
|
| 35 |
+
|
| 36 |
+
# Compute the cosine similarity of the embeddings and plot the heatmap
|
| 37 |
+
similarity_mat = cosine_similarity(embeddings)
|
| 38 |
+
cosine_sim_df = pd.DataFrame(similarity_mat, columns=response_df.id, index=response_df.id)
|
| 39 |
+
self.plot_heatmap(similarity_df=cosine_sim_df)
|
| 40 |
+
|
| 41 |
+
# Perform agglomerative clustering and plot the dendrogram
|
| 42 |
+
clustering = AgglomerativeClustering().fit(embeddings)
|
| 43 |
+
response_df["cluster_labels"] = clustering.labels_
|
| 44 |
+
response_df = response_df.sort_values("cluster_labels")
|
| 45 |
+
cluster_result = ClusterResult(similarity_df=cosine_sim_df, cluster_df=response_df)
|
| 46 |
+
|
| 47 |
+
return cluster_result
|
| 48 |
+
|
| 49 |
+
def plot_cluster_dendrogram(self, encodings: np.ndarray):
|
| 50 |
+
# Calculate the linkage: mergings
|
| 51 |
+
mergings = linkage(encodings, method='ward')
|
| 52 |
+
|
| 53 |
+
fig, ax = plt.subplots()
|
| 54 |
+
# Plot the dendrogram, using varieties as labels
|
| 55 |
+
fig.tight_layout()
|
| 56 |
+
|
| 57 |
+
dendrogram(mergings,
|
| 58 |
+
labels=[f"skills Data Scientist {i + 1}" for i in range(encodings.shape[0])],
|
| 59 |
+
leaf_rotation=90,
|
| 60 |
+
leaf_font_size=6,
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
plt.title("Clustering Dendrogram of the Data Scientist query-response embeddings")
|
| 64 |
+
plt.xticks(rotation = 0)
|
| 65 |
+
# plt.show()
|
| 66 |
+
st.pyplot(fig)
|
| 67 |
+
|
| 68 |
+
def plot_heatmap(self, similarity_df: pd.DataFrame):
|
| 69 |
+
fig1, ax1 = plt.subplots()
|
| 70 |
+
# Plot the dendrogram, using varieties as labels
|
| 71 |
+
fig1.tight_layout()
|
| 72 |
+
plt.title("Cosine-similarity heatmap of the Data Scientist query-response embeddings")
|
| 73 |
+
sns.heatmap(similarity_df, cmap="viridis")
|
| 74 |
+
st.pyplot(fig1)
|
multi_index_demo/streamlit_utils.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import fitz
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import re
|
| 5 |
+
from typing import List, Dict
|
| 6 |
+
|
| 7 |
+
def local_css(file_name):
    """Inject the contents of a local CSS file into the Streamlit page."""
    with open(file_name) as css_file:
        css = css_file.read()
    st.markdown(f'<style>{css}</style>', unsafe_allow_html=True)
|
| 10 |
+
|
| 11 |
+
def remote_css(url):
    """Link an external stylesheet into the Streamlit page."""
    link_tag = f'<link href="{url}" rel="stylesheet">'
    st.markdown(link_tag, unsafe_allow_html=True)
|
| 13 |
+
|
| 14 |
+
def icon(icon_name):
    """Render a Material Design icon by name on the Streamlit page."""
    markup = f'<i class="material-icons">{icon_name}</i>'
    st.markdown(markup, unsafe_allow_html=True)
|
| 16 |
+
|
| 17 |
+
@st.cache_data
def load_pdf_files(files: List) -> List[Dict]:
    """
    Load and cache the text content of PDF Files based on a list of file objects.

    :param files: File objects (e.g. streamlit file uploads)

    :return: One dict per pdf with the keys 'engine_name', 'index_name', 'text'
             and 'title'
    """
    content_list = []
    # Iterating an empty list is a no-op, so the original `if len(files) > 0`
    # guard was redundant
    for ind, file in enumerate(files):
        with fitz.open(stream=file.read(), filetype="pdf") as doc:
            # Concatenate the text of all pages of the pdf
            full_text = "\n".join(page.get_text() for page in doc)

        cv_id = ind + 1
        # Raw string: "\.pdf" without the r-prefix is an invalid escape sequence
        # (SyntaxWarning on recent Python versions)
        title = re.sub(r"\.pdf", "", file.name)
        title = f"{title} Data Scientist {cv_id}"
        engine_name = re.sub(" ", "_", title)
        tmp_engine_name = f"{engine_name}_query_engine"
        tmp_index_name = f"{engine_name}_index"

        content_list.append({
            "engine_name": tmp_engine_name,
            "index_name": tmp_index_name,
            "text": full_text,
            "title": title
        })

    return content_list
|
multi_index_demo/style.css
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
body {
|
| 2 |
+
color: #fff;
|
| 3 |
+
background-color: #4F8BF9;
|
| 4 |
+
}
|
| 5 |
+
|
| 6 |
+
.stButton>button {
|
| 7 |
+
color: #4F8BF9;
|
| 8 |
+
border-radius: 50%;
|
| 9 |
+
height: 3em;
|
| 10 |
+
width: 3em;
|
| 11 |
+
}
|
| 12 |
+
|
| 13 |
+
.stTextInput>div>div>input {
|
| 14 |
+
color: #4F8BF9;
|
| 15 |
+
}
|
poetry.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[tool.poetry]
|
| 2 |
+
name = "multi-index-demo"
|
| 3 |
+
version = "0.1.2"
|
| 4 |
+
description = "This repository can be used as a demo for multi-index question answering"
|
| 5 |
+
authors = ["christoph_hiemenz <chiemenz@positivethinking.tech>"]
|
| 6 |
+
readme = "README.md"
|
| 7 |
+
packages = [{include = "multi_index_demo"}]
|
| 8 |
+
|
| 9 |
+
[tool.poetry.dependencies]
|
| 10 |
+
python = ">=3.10,<3.13"
|
| 11 |
+
streamlit = "^1.26.0"
|
| 12 |
+
llama-index = "^0.8.11.post3"
|
| 13 |
+
sentence-transformers = "^2.2.2"
|
| 14 |
+
networkx = "^3.1"
|
| 15 |
+
scikit-learn = "^1.3.0"
|
| 16 |
+
scipy = "^1.11.2"
|
| 17 |
+
pandas = "^2.0.3"
|
| 18 |
+
numpy = "^1.25.2"
|
| 19 |
+
matplotlib = "^3.7.2"
|
| 20 |
+
pymupdf = "^1.23.1"
|
| 21 |
+
faiss-cpu = "^1.7.4"
|
| 22 |
+
seaborn = "^0.12.2"
|
| 23 |
+
python-dotenv = "^1.0.0"
|
| 24 |
+
|
| 25 |
+
[tool.pytest.ini_options]
|
| 26 |
+
minversion = "6.0"
|
| 27 |
+
addopts = "-ra -q"
|
| 28 |
+
testpaths = [
|
| 29 |
+
"tests"
|
| 30 |
+
]
|
| 31 |
+
|
| 32 |
+
[build-system]
|
| 33 |
+
requires = ["poetry-core"]
|
| 34 |
+
build-backend = "poetry.core.masonry.api"
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit==1.26.0
|
| 2 |
+
llama-index==0.8.11.post3
|
| 3 |
+
sentence-transformers==2.2.2
|
| 4 |
+
networkx==3.1
|
| 5 |
+
scikit-learn==1.3.0
|
| 6 |
+
scipy==1.11.2
|
| 7 |
+
pandas==2.0.3
|
| 8 |
+
numpy==1.25.2
|
| 9 |
+
matplotlib==3.7.2
|
| 10 |
+
pymupdf==1.23.1
|
| 11 |
+
faiss-cpu==1.7.4
|
| 12 |
+
seaborn==0.12.2
|
| 13 |
+
python-dotenv==1.0.0
|
tests/__init__.py
ADDED
|
File without changes
|
tests/test_version.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from multi_index_demo import __version__
|
| 2 |
+
|
| 3 |
+
def test_version():
    """The package version must match the version declared in pyproject.toml."""
    print(__version__)
    # pyproject.toml declares version = "0.1.2"; the test previously pinned the
    # stale value "0.1.1" -- keep this assertion in sync with [tool.poetry] version
    # NOTE(review): confirm multi_index_demo/__init__.py was bumped to 0.1.2 as well
    assert __version__ == "0.1.2"
|