Spaces:
Runtime error
Runtime error
| # import gradio as gr | |
| # gr.load("models/BAAI/bge-m3").launch() | |
| import json | |
| import faiss | |
| import numpy as np | |
| import gradio as gr | |
| import torch | |
| from FlagEmbedding import BGEM3FlagModel | |
| import os | |
| # Define a function to load the ISCO taxonomy | |
def load_isco_taxonomy(file_path: str) -> list:
    """Load the ISCO taxonomy from a JSON Lines file.

    Every non-blank line of the file is parsed as one JSON document. Blank or
    whitespace-only lines (for example stray empty lines at the end of the
    file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        file_path: Path to the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list with one parsed entry per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        # json.loads tolerates surrounding whitespace, so only the blank-line
        # check needs the stripped text.
        return [json.loads(line) for line in file if line.strip()]
| # Define a function to create a FAISS index | |
def create_faiss_index(
    isco_taxonomy,
    model_name="BAAI/bge-m3",
    index_path="/data/isco_taxonomy.index",
    mapping_path="/data/isco_taxonomy_mapping.json",
):
    """Embed the taxonomy descriptions and persist a FAISS index plus mapping.

    The ``ESCO_DESCRIPTION`` field of every entry is encoded with the given
    BGE-M3 model, the dense vectors are added to a flat L2 FAISS index, and
    two files are written: the index itself and a JSON mapping from row
    position to the original taxonomy entry.

    Args:
        isco_taxonomy: Sequence of dict-like entries, each providing an
            ``ESCO_DESCRIPTION`` key. Must be non-empty and JSON-serialisable.
        model_name: Model identifier passed to ``BGEM3FlagModel``.
        index_path: Destination of the serialised FAISS index.
        mapping_path: Destination of the JSON position-to-entry mapping.

    Raises:
        ValueError: If ``isco_taxonomy`` is empty.
        KeyError: If an entry has no ``ESCO_DESCRIPTION`` key.
        OSError: If an output directory or file cannot be created.
    """
    if not isco_taxonomy:
        raise ValueError("isco_taxonomy is empty; there is nothing to index")

    # Half precision is only requested on CUDA; fp16 inference on CPU is
    # slow or unsupported for many kernels.
    # NOTE(review): confirm against the installed FlagEmbedding version.
    use_cuda = torch.cuda.is_available()
    model = BGEM3FlagModel(
        model_name,
        use_fp16=use_cuda,
        device="cuda" if use_cuda else "cpu",
    )

    texts = [str(entry["ESCO_DESCRIPTION"]) for entry in isco_taxonomy]
    # Only the dense vectors are indexed, so the sparse and ColBERT outputs
    # are not requested.
    embeddings = model.encode(
        texts,
        batch_size=12,
        max_length=128,
        return_dense=True,
        return_sparse=False,
        return_colbert_vecs=False,
    )["dense_vecs"]
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # Make sure the target directories exist before writing.
    for path in (index_path, mapping_path):
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    # Write the mapping first so that an index file on disk always has a
    # matching mapping next to it, even if the process dies in between.
    # JSON object keys are strings, so position i is stored under str(i).
    with open(mapping_path, "w", encoding="utf-8") as f:
        json.dump({i: entry for i, entry in enumerate(isco_taxonomy)}, f)
    faiss.write_index(index, index_path)
| # Define a function to retrieve and rerank using FAISS | |
_INDEX_PATH = "/data/isco_taxonomy.index"
_MAPPING_PATH = "/data/isco_taxonomy_mapping.json"

# Loaded embedding models keyed by model name, so each model is instantiated
# once per process rather than once per query.
_MODEL_CACHE = {}


def _get_embedding_model(model_name):
    """Return the BGEM3FlagModel for ``model_name``, loading it on first use."""
    model = _MODEL_CACHE.get(model_name)
    if model is None:
        # Half precision is only requested on CUDA; fp16 inference on CPU is
        # slow or unsupported for many kernels.
        # NOTE(review): confirm against the installed FlagEmbedding version.
        use_cuda = torch.cuda.is_available()
        model = BGEM3FlagModel(
            model_name,
            use_fp16=use_cuda,
            device="cuda" if use_cuda else "cpu",
        )
        _MODEL_CACHE[model_name] = model
    return model


def retrieve_and_rerank_faiss(job, model_name="BAAI/bge-m3", top_k=8):
    """Return the taxonomy entries closest to ``job``, nearest first.

    If the FAISS index or its JSON mapping is missing on disk, both are
    (re)built from ``isco_taxonomy.jsonl`` using ``model_name``. The query is
    then embedded with the same model and searched against the index.

    Args:
        job: Free-text job title or description to look up.
        model_name: Model identifier passed to ``BGEM3FlagModel``.
        top_k: Maximum number of neighbours to return.

    Returns:
        A list of rows ``[distance, ISCO_CODE_4, ISCO_LABEL_4,
        ESCO_OCCUPATION, ESCO_DESCRIPTION]`` sorted by ascending distance.
        Fewer than ``top_k`` rows are returned when the index holds fewer
        vectors than requested.
    """
    # Both files are required; rebuild if either one is missing.
    if not (os.path.exists(_INDEX_PATH) and os.path.exists(_MAPPING_PATH)):
        isco_taxonomy = load_isco_taxonomy("isco_taxonomy.jsonl")
        create_faiss_index(isco_taxonomy, model_name)

    index = faiss.read_index(_INDEX_PATH)
    with open(_MAPPING_PATH, "r", encoding="utf-8") as f:
        isco_taxonomy = json.load(f)

    model = _get_embedding_model(model_name)
    # Only the dense vector is searched, so the sparse and ColBERT outputs
    # are not requested.
    query_embedding = model.encode(
        [job],
        max_length=128,
        return_dense=True,
        return_sparse=False,
        return_colbert_vecs=False,
    )["dense_vecs"]
    query_embedding = np.ascontiguousarray(query_embedding, dtype="float32")

    distances, indices = index.search(query_embedding, top_k)

    results = []
    for distance, idx in zip(distances[0], indices[0]):
        # FAISS pads the result with -1 when fewer than top_k vectors exist.
        if idx < 0:
            continue
        # The mapping was round-tripped through JSON, so its keys are strings.
        entry = isco_taxonomy[str(idx)]
        results.append(
            [
                float(distance),
                entry["ISCO_CODE_4"],
                entry["ISCO_LABEL_4"],
                entry["ESCO_OCCUPATION"],
                entry["ESCO_DESCRIPTION"],
            ]
        )

    results.sort(key=lambda row: row[0])
    return results
# Text shown in the collapsible "Explanation" panel of the UI.
EXPLANATION_MARKDOWN = """
### Overview of the ESCO rank and retrieve application
The ESCO rank and retrieve application developed using Gradio and the BAAI/BGE-m3 model via a FAISS vector database represents a novel approach in the realm of information retrieval, particularly in the context of occupational classifications such as the ISCO-08 standard.
This application leverages machine learning to semantically process and rank occupation-related documents based on their relevance to user-input job descriptions.
### How the Application Works
The application is structured into several key components:
1. **Data preparation:** The ESCO taxonomy data, which includes descriptions of various occupations and corresponding ISCO codes, is initially loaded and processed. This involves reading from a JSON Lines file, ensuring that each entry is correctly formatted and accessible for subsequent operations.
2. **Embedding generation:** Using the BAAI/BGE-m3 model, which is optimized for multilingual information processing and retrieval tasks, embeddings (high-dimensional vector representations) are generated for each occupation description in the ESCO dataset. These embeddings capture the semantic essence of the text, allowing for meaningful comparisons between texts.
3. **Index creation and storage:** The generated embeddings are then stored in a Faiss index. [Faiss](https://faiss.ai/) (Facebook AI Similarity Search) is an efficient library for similarity search and clustering of dense vectors. It facilitates rapid retrieval of items whose embeddings are most similar to that of a query vector (e.g., cosine of the angle or euclidian distance between two vectors).
4. **Retrieval and Ranking:** When a user submits a job title or description of the job through the Gradio interface, the application:
- Generates an embedding for the input using the same BAAI/BGE-m3 model.
- Queries the pre-computed FAISS index to retrieve the closest occupation descriptions based on cosine similarity measures between embeddings.
- Ranks these descriptions according to their similarity scores and presents the results to the user.
### Advantages of the rank and retrieve method
#### Enhanced relevance through semantic processing
Unlike traditional keyword-based search methods, the rank and retrieve approach uses pre-trained deep learning models to understand the context and semantics of texts.
This ensures that the results are not just syntactically but also semantically aligned with the user’s query, thereby increasing the relevance and utility of the retrieved documents.
#### Efficiency and scalability
By pre-computing embeddings and storing them in a FAISS index, the application can quickly retrieve and rank documents without the need for on-the-fly computation.
This makes the system highly efficient and scalable, capable of handling large datasets and high query volumes with minimal latency.
#### Avoidance of training on sensitive data
One significant advantage of this approach over traditional text classification models is that it does not require training on sensitive or personally identifiable information (PII).
Since the model operates solely on public domain occupational descriptions from ESCO, there is no need to train a text classification model and hence no risk of exposing personal data.
An important factor given the regulations around data privacy (such as GDPR in Europe) and the ethical considerations of working with PII.
#### Adaptability and Multilingual Capability
The BAAI/BGE-m3 model's multilingual capabilities mean that the application can function effectively across different languages without the need for separate models or extensive retraining.
This adaptability makes it suitable for global deployment, particularly in diverse linguistic and cultural contexts.
### Conclusion
The rank and retrieve application showcases an advanced use of langauge models in information retrieval, offering a practical, efficient, and privacy-respecting solution for matching job titles (and/or descriptions) with occupational standards like ISCO-08.
"""

# Column titles for the results table; the order matches the rows returned by
# retrieve_and_rerank_faiss.
RESULT_HEADERS = [
    "Distance",
    "ISCO code",
    "ISCO label",
    "ESCO occupation",
    "ESCO description",
]


def greet(job):
    """Click handler: look up the occupations closest to the entered job."""
    return retrieve_and_rerank_faiss(job)


with gr.Blocks() as demo:
    with gr.Row():
        text1 = gr.Textbox(label="Job")
        # text2 = gr.Textbox(label="Duties")
        # drop1 = gr.Dropdown([4, 6, 8, 10], label="Number of results")
        btn = gr.Button("Submit")
    with gr.Row():
        with gr.Column(scale=1, min_width=600):
            # Output component; without it the handler has nowhere to
            # display its result.
            results_table = gr.Dataframe(
                headers=RESULT_HEADERS,
                label="Closest occupations",
                interactive=False,
                wrap=True,
            )
            with gr.Accordion(label="Explanation", open=False):
                gr.Markdown(EXPLANATION_MARKDOWN)
    # Register the handler; event listeners must be attached inside the
    # Blocks context.
    btn.click(fn=greet, inputs=text1, outputs=results_table)

demo.launch()