Spaces: build-error banner captured from the hosting page during extraction (the Space failed to build); not part of the program source.
| from sentence_transformers import SentenceTransformer | |
| import faiss | |
| import numpy as np | |
| import openai | |
| import pandas as pd | |
| import os | |
| from retry import retry | |
# Configure the OpenAI client with the API key taken from the environment.
# (Original comment, Japanese: "APIキーの設定" = "setting the API key".)
# Raises KeyError at import time if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]
class skillFinder:
    """Find rows of a skill DataFrame similar to a query string.

    Builds three FAISS inner-product indexes over the unique values of the
    ``source``, ``selected_skill`` and ``name`` columns of *skill_df*, using
    L2-normalized SentenceTransformer embeddings so that inner product equals
    cosine similarity.
    """

    def __init__(self, skill_df):
        # skill_df: pandas DataFrame with at least the columns
        # "source", "selected_skill", "name" (and "display_name" for callers
        # such as calculate_score).
        self.skill_df = skill_df
        self.ST_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
        self.source_skill_names = list(skill_df["source"].unique())
        self.selected_skill_names = list(skill_df["selected_skill"].unique())
        # Cache the unique "name" values once; previously they were
        # recomputed on every find_similar_skill() call.
        self.name_skill_names = list(skill_df["name"].unique())
        self.vectorization()

    def _build_index(self, names):
        """Encode *names*, L2-normalize, and return a filled FAISS IndexFlatIP."""
        vectors = self.ST_model.encode(names)
        vectors = vectors / np.linalg.norm(vectors, axis=1)[:, None]
        index = faiss.IndexFlatIP(vectors.shape[1])
        index.add(vectors)
        return index

    def vectorization(self):
        """Build the three similarity indexes used by find_similar_skill()."""
        self.source_index = self._build_index(self.source_skill_names)
        self.selected_index = self._build_index(self.selected_skill_names)
        self.name_index = self._build_index(self.name_skill_names)

    def find_similar_skill(self, skill_name, range_threshold=0.7):
        """Return the rows of skill_df similar to *skill_name*.

        All three indexes are range-searched with a cosine-similarity radius
        of *range_threshold*; matching rows are returned sorted by the best
        similarity across the three columns ("max_similarity", descending).
        """
        skill_vector = self.ST_model.encode([skill_name])
        skill_vector = skill_vector / np.linalg.norm(skill_vector, axis=1)[:, None]

        def search(index, names):
            # faiss range_search returns (lims, distances, indices); lims is
            # only needed for multi-query searches, so it is discarded here.
            _, distances, indices = index.range_search(skill_vector, range_threshold)
            hits = [names[i] for i in indices]
            return hits, dict(zip(hits, distances))

        source_hits, source_dist = search(self.source_index, self.source_skill_names)
        selected_hits, selected_dist = search(self.selected_index, self.selected_skill_names)
        name_hits, name_dist = search(self.name_index, self.name_skill_names)

        # .copy() so the column assignments below write into an independent
        # frame; the original assigned into a boolean-indexed view of
        # skill_df, which triggers pandas' SettingWithCopyWarning and may
        # silently fail to write.
        similar_df = self.skill_df[
            self.skill_df["source"].isin(source_hits)
            | self.skill_df["selected_skill"].isin(selected_hits)
            | self.skill_df["name"].isin(name_hits)
        ].copy()
        similar_df["source_distances"] = similar_df["source"].map(source_dist)
        similar_df["selected_distances"] = similar_df["selected_skill"].map(selected_dist)
        similar_df["name_distances"] = similar_df["name"].map(name_dist)
        # Rows matched in only some columns get NaN in the others;
        # DataFrame.max skips NaN by default.
        similar_df["max_similarity"] = similar_df[
            ["source_distances", "selected_distances", "name_distances"]
        ].max(axis=1)
        return similar_df.sort_values(by="max_similarity", ascending=False)
def send_chatAPI(template: str, word: str):
    """Send one system/user message pair to the OpenAI chat API.

    template -- the system prompt; word -- the user message.
    Returns the assistant's reply text (legacy openai<1.0 ChatCompletion API).
    """
    messages = [
        {"role": "system", "content": template},
        {"role": "user", "content": word},
    ]
    # temperature=0 for deterministic output; short timeout keeps the UI responsive.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-1106",
        messages=messages,
        temperature=0,
        request_timeout=7.5,
    )
    return response["choices"][0]["message"]["content"]
def extract_required_skill(system_template, text):
    """Ask the chat model for required skills and normalize their levels.

    Sends *text* with the *system_template* prompt to the chat API, expecting
    one skill per line in the form ``- <skill>,<level>``.  Levels are rescaled
    into (0, 1] via ``(level - (min - 1)) / (max - (min - 1))`` so the lowest
    level still receives a positive weight.

    Returns the resulting ``{skill: weight}`` dict twice (the original
    interface returns the same mapping as both tuple elements).

    Raises ValueError if no skill line can be parsed from the model output.
    """
    print(system_template)
    label = send_chatAPI(system_template, text)
    print(label)
    skills, levels = [], []
    for line in label.split("\n"):
        line = line.strip()
        if not line:
            continue
        # rpartition keeps skill names that themselves contain a comma intact
        # (the original `line.split(",")` unpacking crashed on them).
        skill, _, level = line.rpartition(",")
        # Tolerate lines the model emits without the leading "- " bullet
        # (the original `split("- ")[1]` raised IndexError on those).
        skill = skill.split("- ", 1)[-1].strip()
        skills.append(skill)
        levels.append(int(level))
    if not levels:
        # Fail loudly with context instead of the cryptic max([]) ValueError.
        raise ValueError(f"no skill lines could be parsed from: {label!r}")
    max_level = max(levels)
    min_level = min(levels) - 1  # shift by one so (max - min) is never zero
    label = {
        skill: (level - min_level) / (max_level - min_level)
        for skill, level in zip(skills, levels)
    }
    print(label)
    return label, label
def calculate_score(skill_finder, required_skill_dict):
    """Score users by how well their skills cover *required_skill_dict*.

    skill_finder -- object exposing find_similar_skill(skill) returning a
        DataFrame with columns "display_name", "name" and "max_similarity".
    required_skill_dict -- mapping {required_skill: weight}.

    For each required skill, at most the first two (deduplicated) matching
    skills per user contribute ``max_similarity * weight`` to that user's
    total.  Returns a DataFrame with columns "user", "total_score" and
    "related_tags", sorted by total_score descending.  An empty but
    well-formed DataFrame is returned when nothing matches (the original
    raised KeyError on sort_values in that case).
    """
    total_score_dict = {}
    for required_skill, weight in required_skill_dict.items():
        similar_df = skill_finder.find_similar_skill(required_skill)
        # Reassign instead of inplace=True: same result, no chained-assignment
        # warning on the derived frame.
        similar_df = similar_df.drop_duplicates(subset=["name", "display_name"])
        match_count = {}  # per-user hit counter for this required skill
        for user, name, similarity in zip(
            similar_df["display_name"], similar_df["name"], similar_df["max_similarity"]
        ):
            match_count[user] = match_count.get(user, 0) + 1
            if match_count[user] >= 3:
                continue  # cap: only a user's first two matches count
            user_scores = total_score_dict.setdefault(user, {})
            user_scores[name] = user_scores.get(name, 0) + similarity * weight
    res = []
    for user, tag_score_dict in total_score_dict.items():
        ranked = sorted(tag_score_dict.items(), key=lambda item: item[1], reverse=True)
        # join instead of += in a loop; trailing ", " kept for compatibility.
        related_tags = "".join(f"{tag}({score:.2f}), " for tag, score in ranked)
        res.append(
            {
                "user": user,
                "total_score": sum(tag_score_dict.values()),
                "related_tags": related_tags,
            }
        )
    if not res:
        return pd.DataFrame(columns=["user", "total_score", "related_tags"])
    return pd.DataFrame(res).sort_values(by="total_score", ascending=False)