Spaces:
Build error
Build error
File size: 6,030 Bytes
23a5255 dc97795 5539dd6 7622cba 23a5255 5539dd6 23a5255 5539dd6 23a5255 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import openai
import pandas as pd
import os
from retry import retry
# Configure the OpenAI API key from the environment
openai.api_key = os.environ["OPENAI_API_KEY"]
class skillFinder:
    """Similarity search over skill names using sentence embeddings.

    Builds three FAISS inner-product indexes — over the unique values of the
    "source", "selected_skill" and "name" columns of ``skill_df`` — so that
    ``find_similar_skill`` can retrieve rows similar to a query string by
    cosine similarity (vectors are L2-normalized before indexing).
    """

    def __init__(self, skill_df):
        """
        Args:
            skill_df: DataFrame with at least "source", "selected_skill" and
                "name" columns containing skill-name strings.
        """
        self.skill_df = skill_df
        self.ST_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
        self.source_skill_names = list(skill_df["source"].unique())
        self.selected_skill_names = list(skill_df["selected_skill"].unique())
        # Cache the unique names once so that index construction and lookup
        # in find_similar_skill always use the exact same ordering.
        self.unique_names = list(skill_df["name"].unique())
        self.vectorization()

    def _build_index(self, names):
        """Encode ``names``, L2-normalize, and return a filled FAISS IP index.

        With unit-norm vectors, inner product equals cosine similarity.
        """
        vectors = self.ST_model.encode(names)
        vectors = vectors / np.linalg.norm(vectors, axis=1)[:, None]
        index = faiss.IndexFlatIP(vectors.shape[1])
        index.add(vectors)
        return index

    def vectorization(self):
        """Build the three similarity indexes used by find_similar_skill."""
        self.source_index = self._build_index(self.source_skill_names)
        self.selected_index = self._build_index(self.selected_skill_names)
        self.name_index = self._build_index(self.unique_names)

    def find_similar_skill(self, skill_name, range_threshold=0.7):
        """Return the skill_df rows similar to ``skill_name``.

        A row matches when its "source", "selected_skill" or "name" value has
        cosine similarity >= ``range_threshold`` with the query. The result
        gains "source_distances", "selected_distances", "name_distances" and
        "max_similarity" columns and is sorted by "max_similarity" descending.
        """
        query = self.ST_model.encode([skill_name])
        query = query / np.linalg.norm(query, axis=1)[:, None]

        def search(index, names):
            # range_search returns (lims, distances, labels); lims is unused
            # because we issue a single query.
            _, distances, labels = index.range_search(query, range_threshold)
            matched = [names[i] for i in labels]
            return matched, dict(zip(matched, distances))

        source_names, source_dist = search(self.source_index, self.source_skill_names)
        selected_names, selected_dist = search(
            self.selected_index, self.selected_skill_names
        )
        name_matches, name_dist = search(self.name_index, self.unique_names)

        # .copy() so the column assignments below target a real frame instead
        # of a view of skill_df (avoids SettingWithCopyWarning / lost writes).
        similar_df = self.skill_df[
            self.skill_df["source"].isin(source_names)
            | self.skill_df["selected_skill"].isin(selected_names)
            | self.skill_df["name"].isin(name_matches)
        ].copy()
        similar_df["source_distances"] = similar_df["source"].map(source_dist)
        similar_df["selected_distances"] = similar_df["selected_skill"].map(
            selected_dist
        )
        similar_df["name_distances"] = similar_df["name"].map(name_dist)
        # A row may match on only some of the three fields; max() skips the
        # NaNs produced by the unmatched maps.
        similar_df["max_similarity"] = similar_df[
            ["source_distances", "selected_distances", "name_distances"]
        ].max(axis=1)
        return similar_df.sort_values(by="max_similarity", ascending=False)
@retry(tries=3)
def send_chatAPI(template: str, word: str):
    """Send a system template plus a user message to the chat model.

    Retries up to 3 times on failure (via the ``retry`` decorator) and
    times out each request after 7.5 seconds.

    Returns:
        The assistant's reply text.
    """
    chat_messages = [
        {"role": "system", "content": template},
        {"role": "user", "content": word},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-1106",
        messages=chat_messages,
        temperature=0,
        request_timeout=7.5,
    )
    reply = response["choices"][0]["message"]
    return reply["content"]
def extract_required_skill(system_template, text):
    """Ask the chat model to list required skills and normalize their levels.

    The model is expected to answer one skill per line in the form
    ``- <skill>, <level>``. Levels are min-max normalized, with the minimum
    shifted down by 1 so the weakest skill still gets a positive score.

    Lines that do not match the expected shape are skipped (LLM output is
    not guaranteed to be well formed) instead of crashing the whole parse.

    Args:
        system_template: system prompt describing the extraction task.
        text: user-supplied text to extract skills from.

    Returns:
        tuple: ``(label, label)`` — the skill->normalized-level dict, twice.
        NOTE(review): both return values are identical; kept for callers
        that unpack two values.

    Raises:
        ValueError: if no skill lines could be parsed from the model output.
    """
    print(system_template)
    label = send_chatAPI(system_template, text)
    print(label)
    skills, levels = [], []
    for line in label.split("\n"):
        line = line.strip()
        if not line:
            continue
        try:
            # rsplit tolerates commas inside the skill name itself.
            skill_part, level_part = line.rsplit(",", 1)
            skills.append(skill_part.split("- ", 1)[1])
            levels.append(int(level_part))
        except (ValueError, IndexError):
            # Malformed line (no comma, no "- " prefix, or non-integer level)
            print(f"skipping malformed line: {line!r}")
    if not levels:
        raise ValueError("no skills could be parsed from the model output")
    max_level = max(levels)
    min_level = min(levels) - 1  # -1 keeps the lowest level strictly above 0
    # Normalize the levels using both max and min values
    label = {
        skill: (level - min_level) / (max_level - min_level)
        for skill, level in zip(skills, levels)
    }
    print(label)
    return label, label
def calculate_score(skill_finder, required_skill_dict):
    """Score users by how well their skill tags match the required skills.

    Args:
        skill_finder: object exposing ``find_similar_skill(skill_name)``
            returning a DataFrame with "display_name", "name" and
            "max_similarity" columns.
        required_skill_dict: mapping of required-skill name -> weight.

    Returns:
        DataFrame with "user", "total_score" and "related_tags" columns,
        sorted by "total_score" descending. Empty (with those columns) when
        there are no required skills or no matches.
    """
    total_score_dict = {}
    for required_skill, weight in required_skill_dict.items():
        user_count = {}
        similar_df = skill_finder.find_similar_skill(required_skill)
        similar_df = similar_df.drop_duplicates(subset=["name", "display_name"])
        for user, name, max_similarity in zip(
            similar_df["display_name"],
            similar_df["name"],
            similar_df["max_similarity"],
        ):
            user_count[user] = user_count.get(user, 0) + 1
            # Only the two best matches per user count toward each skill
            # (count is incremented before the check, so < 3 admits two).
            if user_count[user] < 3:
                total_score_dict.setdefault(user, {}).setdefault(name, 0)
                total_score_dict[user][name] += max_similarity * weight
    res = []
    for user, tag_score_dict in total_score_dict.items():
        total_score = sum(tag_score_dict.values())
        # Trailing ", " is kept for backward-compatible display output.
        related_tags = "".join(
            f"{tag}({score:.2f}), "
            for tag, score in sorted(
                tag_score_dict.items(), key=lambda item: item[1], reverse=True
            )
        )
        res.append(
            {"user": user, "total_score": total_score, "related_tags": related_tags}
        )
    if not res:
        # sort_values would raise KeyError on a frame with no columns.
        return pd.DataFrame(columns=["user", "total_score", "related_tags"])
    return pd.DataFrame(res).sort_values(by="total_score", ascending=False)