Spaces:
Paused
Paused
| from langchain_community.vectorstores import Chroma | |
| from langchain_community.embeddings import HuggingFaceBgeEmbeddings | |
| from sentence_transformers import SentenceTransformer, util | |
| from langchain.docstore.document import Document | |
| import numpy as np | |
| from config import * | |
| import os | |
# NOTE(review): an empty CURL_CA_BUNDLE is a common workaround that makes
# requests-based downloads skip TLS certificate verification -- confirm this
# is intentional before shipping. It is set before any embedding model is built.
os.environ['CURL_CA_BUNDLE'] = ""


def _bge_embeddings(query_instruction):
    """Build a BGE embedding wrapper; only the query instruction varies."""
    # MODEL_NAME and ENCODE_KWARGS are presumably provided by config's star import.
    return HuggingFaceBgeEmbeddings(
        model_name=MODEL_NAME,
        encode_kwargs=ENCODE_KWARGS,
        query_instruction=query_instruction,
    )


# Embeddings backing the persisted Chroma index below.
embedding_int = _bge_embeddings(QUERY_INSTRUCTION)

# Embeddings for the temporary occupation index built in find_similar_occupation.
embedding_sim = _bge_embeddings('Retrieve semantically similar text.')

# Persisted vector store and a retriever limited to TOP_K results per query.
db = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embedding_int)
retriever = db.as_retriever(search_kwargs={"k": TOP_K})
def find_similar_occupation(target_occupation_query, berufe, top_k, similarity_func):
    """Return the occupations most similar to a free-text query.

    One ``Document`` is created per row of ``berufe``. Its ``page_content``
    (short name, full name and description joined by spaces) is the text that
    gets embedded and searched; the metadata carries the fields handed back
    to the caller. The documents are indexed in a throw-away Chroma
    collection that only lives for the duration of this call.

    Args:
        target_occupation_query: Query text to match occupations against.
        berufe: Table of occupations exposing ``iterrows()`` (presumably a
            pandas DataFrame) with the columns ``id``, ``short name``,
            ``full name``, ``description`` and ``entry requirements``.
        top_k: Maximum number of documents to retrieve.
        similarity_func: Value stored as the collection's ``hnsw:space``
            setting (Chroma documents ``"l2"``, ``"ip"`` and ``"cosine"``).

    Returns:
        The list of documents returned by the retriever, or an empty list
        when ``berufe`` contains no rows.
    """
    import uuid

    docs = [
        Document(
            page_content=beruf['short name'] + ' ' + beruf['full name'] + ' ' + beruf['description'],
            metadata={
                "id": beruf["id"],
                "name": beruf['short name'],
                "description": beruf["description"],
                "entry_requirements": beruf["entry requirements"],
            },
        )
        for _, beruf in berufe.iterrows()
    ]

    # Nothing to index: skip Chroma entirely instead of handing it an empty batch.
    if not docs:
        return []

    # A unique collection name keeps calls isolated. With the default name,
    # repeated calls in one process reuse the same in-memory collection, so
    # documents accumulate and "hnsw:space" is only honoured on first creation.
    db_temp = Chroma.from_documents(
        documents=docs,
        embedding=embedding_sim,
        collection_name=f"occupations-{uuid.uuid4().hex}",
        collection_metadata={"hnsw:space": similarity_func},
    )
    try:
        # The retriever returns at most top_k documents for the query.
        retriever_temp = db_temp.as_retriever(search_kwargs={"k": top_k})
        return retriever_temp.get_relevant_documents(target_occupation_query)
    finally:
        # Drop the temporary collection so it does not leak across calls.
        db_temp.delete_collection()