from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import openai
import pandas as pd
import os
from retry import retry

# Configure the OpenAI API key from the environment.
openai.api_key = os.environ["OPENAI_API_KEY"]


class skillFinder:
    """Vector-similarity search over a skill DataFrame.

    Builds three FAISS inner-product indexes (cosine similarity via
    L2-normalized embeddings) over the unique values of the DataFrame's
    "source", "selected_skill" and "name" columns, and matches an input
    skill name against all three.

    Parameters
    ----------
    skill_df : pd.DataFrame
        Must contain at least the columns "source", "selected_skill",
        "name" and (for calculate_score callers) "display_name".
    """

    def __init__(self, skill_df):
        self.skill_df = skill_df
        self.ST_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
        self.source_skill_names = list(skill_df["source"].unique())
        self.selected_skill_names = list(skill_df["selected_skill"].unique())
        # Cache the unique names once so find_similar_skill does not have to
        # recompute skill_df["name"].unique() on every query.
        self.unique_names = skill_df["name"].unique()
        self.vectorization()

    def _build_index(self, texts):
        """Encode ``texts``, L2-normalize the vectors, and return a filled
        ``faiss.IndexFlatIP`` (inner product on unit vectors == cosine)."""
        vectors = self.ST_model.encode(texts)
        vectors = vectors / np.linalg.norm(vectors, axis=1)[:, None]
        index = faiss.IndexFlatIP(vectors.shape[1])
        index.add(vectors)
        return index

    def vectorization(self):
        """Build the three FAISS indexes (source / selected_skill / name)."""
        self.source_index = self._build_index(self.source_skill_names)
        self.selected_index = self._build_index(self.selected_skill_names)
        self.name_index = self._build_index(self.unique_names)

    def _range_search(self, index, names, query_vector, range_threshold):
        """Range-search ``index`` and return ``{name: similarity}`` for every
        hit whose inner product is >= ``range_threshold``.

        faiss ``range_search`` returns ``(lims, distances, indices)``; the
        ``lims`` offsets are irrelevant for a single query vector.
        """
        _, distances, indices = index.range_search(query_vector, range_threshold)
        return {names[i]: dist for i, dist in zip(indices, distances)}

    def find_similar_skill(self, skill_name, range_threshold=0.7):
        """Return the rows of ``skill_df`` similar to ``skill_name``.

        A row matches when its "source", "selected_skill" or "name" value is
        within ``range_threshold`` cosine similarity of the query. The result
        gains per-column similarity columns and a "max_similarity" column and
        is sorted by "max_similarity" descending.
        """
        query = self.ST_model.encode([skill_name])
        query = query / np.linalg.norm(query, axis=1)[:, None]

        source_hits = self._range_search(
            self.source_index, self.source_skill_names, query, range_threshold
        )
        selected_hits = self._range_search(
            self.selected_index, self.selected_skill_names, query, range_threshold
        )
        name_hits = self._range_search(
            self.name_index, self.unique_names, query, range_threshold
        )

        # .copy() so the column assignments below do not trigger pandas'
        # SettingWithCopyWarning / write into a view of self.skill_df.
        similar_df = self.skill_df[
            self.skill_df["source"].isin(list(source_hits))
            | self.skill_df["selected_skill"].isin(list(selected_hits))
            | self.skill_df["name"].isin(list(name_hits))
        ].copy()

        similar_df["source_distances"] = similar_df["source"].map(source_hits)
        similar_df["selected_distances"] = similar_df["selected_skill"].map(
            selected_hits
        )
        similar_df["name_distances"] = similar_df["name"].map(name_hits)
        # Unmatched columns map to NaN, which max(axis=1) skips by default.
        similar_df["max_similarity"] = similar_df[
            ["source_distances", "selected_distances", "name_distances"]
        ].max(axis=1)
        return similar_df.sort_values(by="max_similarity", ascending=False)


@retry(tries=3)
def send_chatAPI(template: str, word: str) -> str:
    """Send one system+user message pair to the ChatCompletion API.

    Retries up to 3 times (via ``retry``) on any exception, e.g. the
    7.5-second request timeout. Uses temperature=0 for determinism.
    """
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-1106",
        messages=[
            {"role": "system", "content": template},
            {"role": "user", "content": word},
        ],
        temperature=0,
        request_timeout=7.5,
    )
    return response["choices"][0]["message"]["content"]


def extract_required_skill(system_template, text):
    """Ask the chat model to extract skills/levels from ``text`` and return
    a normalized ``{skill: weight}`` mapping (returned twice, as a pair,
    to preserve the existing caller contract).

    The model's reply is expected to be lines of the form ``- skill, level``
    with an integer level; presumably guaranteed by ``system_template`` —
    TODO confirm against the prompt.
    """
    print(system_template)
    label = send_chatAPI(system_template, text)
    print(label)
    skills, levels = [], []
    for line in label.split("\n"):
        if line != "":
            skill, level = line.split(",")
            skills.append(skill.split("- ")[1])
            levels.append(int(level))
    max_level = max(levels)
    # Subtracting 1 from the minimum keeps the denominator >= 1 and the
    # lowest-level skill's weight strictly positive.
    min_level = min(levels) - 1
    label = {
        skill: (level - min_level) / (max_level - min_level)
        for skill, level in zip(skills, levels)
    }  # Normalize the levels using both max and min values
    print(label)
    return label, label


def calculate_score(skill_finder, required_skill_dict):
    """Score users by how well their tagged skills match the requirements.

    For each required skill, finds similar rows via ``skill_finder`` and
    credits each user ``max_similarity * requirement_weight`` per matching
    tag. Returns a DataFrame with "user", "total_score" and a formatted
    "related_tags" string, sorted by total score descending.
    """
    total_score_dict = {}
    for required_skill, score in required_skill_dict.items():
        user_count = {}
        similar_df = skill_finder.find_similar_skill(required_skill)
        similar_df.drop_duplicates(subset=["name", "display_name"], inplace=True)
        for user, name, max_similarity in zip(
            similar_df["display_name"],
            similar_df["name"],
            similar_df["max_similarity"],
        ):
            user_count[user] = user_count.get(user, 0) + 1
            # NOTE(review): increment happens before the check, so only a
            # user's top-2 matches are counted despite the "< 3" — confirm
            # whether top-3 was intended.
            if user_count[user] < 3:
                if user not in total_score_dict:
                    total_score_dict[user] = {}
                if name not in total_score_dict[user]:
                    total_score_dict[user][name] = 0
                total_score_dict[user][name] += max_similarity * score
    res = []
    for user, tag_score_dict in total_score_dict.items():
        total_score = sum(tag_score_dict.values())
        related_tags = ''
        for tag, score in sorted(
            tag_score_dict.items(), key=lambda item: item[1], reverse=True
        ):
            related_tags += f'{tag}({score:.2f}), '
        res.append(
            {"user": user, "total_score": total_score, "related_tags": related_tags}
        )
    return pd.DataFrame(res).sort_values(by="total_score", ascending=False)