# skill_maching_msols / function.py
# (GitHub page residue preserved as comments: author "Yongtae",
#  commit message "Update function.py", commit 5539dd6)
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import openai
import pandas as pd
import os
from retry import retry
# Configure the OpenAI API key from the environment (raises KeyError if unset).
openai.api_key = os.environ["OPENAI_API_KEY"]
class skillFinder:
    """Similarity search over a skill table using sentence embeddings.

    Builds three FAISS inner-product indexes over the unique values of the
    ``source``, ``selected_skill``, and ``name`` columns of *skill_df* so a
    query skill name can be matched against any of the three.
    """

    def __init__(self, skill_df):
        # skill_df must contain at least the columns "source",
        # "selected_skill", and "name".
        self.skill_df = skill_df
        self.ST_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
        self.source_skill_names = list(skill_df["source"].unique())
        self.selected_skill_names = list(skill_df["selected_skill"].unique())
        # Cache unique names once so FAISS index positions always line up
        # with the encoded order (previously recomputed on every query,
        # which also repeated the pandas work per call).
        self.unique_names = list(skill_df["name"].unique())
        self.vectorization()

    def _build_index(self, names):
        """Encode *names*, L2-normalize the vectors, and return a FAISS
        inner-product index (so inner product == cosine similarity)."""
        vectors = self.ST_model.encode(names)
        vectors = vectors / np.linalg.norm(vectors, axis=1)[:, None]
        index = faiss.IndexFlatIP(vectors.shape[1])
        index.add(vectors)
        return index

    def vectorization(self):
        """Build the three indexes used by :meth:`find_similar_skill`."""
        self.source_index = self._build_index(self.source_skill_names)
        self.selected_index = self._build_index(self.selected_skill_names)
        self.name_index = self._build_index(self.unique_names)

    def find_similar_skill(self, skill_name, range_threshold=0.7):
        """Return rows of ``skill_df`` whose source / selected_skill / name
        is within cosine similarity *range_threshold* of *skill_name*,
        sorted by the best matching similarity, descending.

        Adds columns ``source_distances``, ``selected_distances``,
        ``name_distances`` and ``max_similarity`` to the returned frame.
        """
        skill_vector = self.ST_model.encode([skill_name])
        skill_vector = skill_vector / np.linalg.norm(skill_vector, axis=1)[:, None]

        def search(index, names):
            # faiss range_search returns (lims, distances, labels); with a
            # single query vector, lims can be ignored.
            _, distances, labels = index.range_search(skill_vector, range_threshold)
            hits = [names[i] for i in labels]
            return hits, dict(zip(hits, distances))

        source_hits, source_dist = search(self.source_index, self.source_skill_names)
        selected_hits, selected_dist = search(
            self.selected_index, self.selected_skill_names
        )
        name_hits, name_dist = search(self.name_index, self.unique_names)

        # .copy() so the column assignments below do not hit pandas'
        # SettingWithCopyWarning (the mask produces a view-like slice).
        similar_df = self.skill_df[
            self.skill_df["source"].isin(source_hits)
            | self.skill_df["selected_skill"].isin(selected_hits)
            | self.skill_df["name"].isin(name_hits)
        ].copy()
        similar_df["source_distances"] = similar_df["source"].map(source_dist)
        similar_df["selected_distances"] = similar_df["selected_skill"].map(
            selected_dist
        )
        similar_df["name_distances"] = similar_df["name"].map(name_dist)
        similar_df["max_similarity"] = similar_df[
            ["source_distances", "selected_distances", "name_distances"]
        ].max(axis=1)
        return similar_df.sort_values(by="max_similarity", ascending=False)
@retry(tries=3)
def send_chatAPI(template: str, word: str):
    """Send *template* (system role) and *word* (user role) to the Chat API
    and return the assistant's reply text.

    Retried up to 3 times via the ``retry`` decorator; each attempt times
    out after 7.5 seconds.
    """
    prompt_messages = [
        {"role": "system", "content": template},
        {"role": "user", "content": word},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-1106",
        messages=prompt_messages,
        temperature=0,
        request_timeout=7.5,
    )
    return completion["choices"][0]["message"]["content"]
def extract_required_skill(system_template, text):
    """Ask the Chat API to extract required skills from *text* and return
    normalized importance weights.

    The model reply is expected to contain one ``- skill,level`` line per
    skill. Levels are min-max normalized into (0, 1]; ``min(levels) - 1``
    guarantees the denominator is >= 1, so no division by zero.

    Returns the same ``{skill: weight}`` dict twice — callers of the
    original unpack two values, so the double return is kept for
    compatibility.
    """
    print(system_template)
    label = send_chatAPI(system_template, text)
    print(label)
    skills, levels = [], []
    for line in label.split("\n"):
        if line == "":
            continue
        # rsplit keeps skill names that themselves contain commas intact
        # (the level is always the last comma-separated field).
        skill_part, level_part = line.rsplit(",", 1)
        skills.append(skill_part.split("- ")[1])
        levels.append(int(level_part))
    if not levels:
        # Unparsable/empty reply: return empty mappings instead of
        # crashing on max()/min() below.
        empty = {}
        return empty, empty
    max_level = max(levels)
    min_level = min(levels) - 1
    # Normalize the levels using both max and min values.
    label = {
        skill: (level - min_level) / (max_level - min_level)
        for skill, level in zip(skills, levels)
    }
    print(label)
    return label, label
def calculate_score(skill_finder, required_skill_dict):
    """Score users by how well their tagged skills match the required skills.

    Args:
        skill_finder: object exposing ``find_similar_skill(skill)`` that
            returns a DataFrame with at least the columns ``name``,
            ``display_name`` and ``max_similarity``.
        required_skill_dict: ``{required_skill: weight}`` mapping.

    Returns:
        DataFrame with columns ``user``, ``total_score``, ``related_tags``,
        sorted by ``total_score`` descending. Empty (with those columns) when
        nothing matches — the original raised KeyError on ``sort_values``
        for an empty frame.
    """
    total_score_dict = {}
    for required_skill, weight in required_skill_dict.items():
        per_user_hits = {}
        similar_df = skill_finder.find_similar_skill(required_skill)
        # Assignment form instead of inplace=True: avoids pandas
        # chained-assignment warnings on a derived frame.
        similar_df = similar_df.drop_duplicates(subset=["name", "display_name"])
        for user, name, max_similarity in zip(
            similar_df["display_name"],
            similar_df["name"],
            similar_df["max_similarity"],
        ):
            per_user_hits[user] = per_user_hits.get(user, 0) + 1
            # Count only the first two matching tags per user for each
            # required skill (rows are presumably similarity-sorted by
            # find_similar_skill — the cap keeps the best ones).
            if per_user_hits[user] < 3:
                total_score_dict.setdefault(user, {})
                total_score_dict[user].setdefault(name, 0)
                total_score_dict[user][name] += max_similarity * weight

    rows = []
    for user, tag_scores in total_score_dict.items():
        ranked = sorted(tag_scores.items(), key=lambda item: item[1], reverse=True)
        # Trailing ", " kept for backward compatibility with existing output.
        related_tags = "".join(f"{tag}({score:.2f}), " for tag, score in ranked)
        rows.append(
            {
                "user": user,
                "total_score": sum(tag_scores.values()),
                "related_tags": related_tags,
            }
        )
    if not rows:
        return pd.DataFrame(columns=["user", "total_score", "related_tags"])
    return pd.DataFrame(rows).sort_values(by="total_score", ascending=False)