File size: 6,030 Bytes
23a5255
 
 
 
 
dc97795
5539dd6
7622cba
 
23a5255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5539dd6
23a5255
 
 
 
 
 
 
 
5539dd6
23a5255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import openai
import pandas as pd
import os
from retry import retry
# Configure the OpenAI API key (read from the environment; raises KeyError if unset)
openai.api_key = os.environ["OPENAI_API_KEY"]

class skillFinder:
    """Similarity search over skill names using sentence embeddings.

    Builds three FAISS inner-product indexes over the ``source``,
    ``selected_skill`` and ``name`` columns of ``skill_df`` so a free-text
    skill name can be matched against any of them.  Vectors are
    L2-normalized before indexing, so inner product equals cosine
    similarity.
    """

    def __init__(self, skill_df):
        # skill_df is expected to contain at least the columns
        # "source", "selected_skill", "name" and "display_name"
        # (the latter is read by callers such as calculate_score).
        self.skill_df = skill_df
        self.ST_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
        self.source_skill_names = list(skill_df["source"].unique())
        self.selected_skill_names = list(skill_df["selected_skill"].unique())
        # Cache unique "name" values once: index positions must map back to
        # exactly this ordering, and the original recomputed unique() on
        # every query.
        self.name_skill_names = list(skill_df["name"].unique())
        self.vectorization()

    def _build_index(self, names):
        """Encode *names*, L2-normalize row-wise, return a filled IndexFlatIP."""
        vectors = self.ST_model.encode(names)
        vectors = vectors / np.linalg.norm(vectors, axis=1)[:, None]
        index = faiss.IndexFlatIP(vectors.shape[1])
        index.add(vectors)
        return index

    def vectorization(self):
        """Build the three similarity indexes used by find_similar_skill."""
        self.source_index = self._build_index(self.source_skill_names)
        self.selected_index = self._build_index(self.selected_skill_names)
        self.name_index = self._build_index(self.name_skill_names)

    def find_similar_skill(self, skill_name, range_threshold=0.7):
        """Return rows of skill_df whose source/selected_skill/name is similar
        to *skill_name*.

        Parameters
        ----------
        skill_name : str
            Free-text skill name to look up.
        range_threshold : float
            Minimum cosine similarity for a match (FAISS range_search radius).

        Returns
        -------
        pandas.DataFrame
            Matching rows with extra columns ``source_distances``,
            ``selected_distances``, ``name_distances`` and ``max_similarity``,
            sorted by ``max_similarity`` descending.
        """
        skill_vector = self.ST_model.encode([skill_name])
        skill_vector = skill_vector / np.linalg.norm(skill_vector, axis=1)[:, None]

        def search(index, names):
            # range_search returns (lims, distances, labels); lims is not
            # needed because we always query a single vector.
            _, distances, labels = index.range_search(skill_vector, range_threshold)
            hits = [names[i] for i in labels]
            return hits, dict(zip(hits, distances))

        source_hits, source_dist = search(self.source_index, self.source_skill_names)
        selected_hits, selected_dist = search(
            self.selected_index, self.selected_skill_names
        )
        name_hits, name_dist = search(self.name_index, self.name_skill_names)

        # .copy() is required: the original assigned new columns to a view of
        # skill_df, raising SettingWithCopyWarning and risking silently lost
        # column assignments.
        similar_df = self.skill_df[
            self.skill_df["source"].isin(source_hits)
            | self.skill_df["selected_skill"].isin(selected_hits)
            | self.skill_df["name"].isin(name_hits)
        ].copy()

        similar_df["source_distances"] = similar_df["source"].map(source_dist)
        similar_df["selected_distances"] = similar_df["selected_skill"].map(
            selected_dist
        )
        similar_df["name_distances"] = similar_df["name"].map(name_dist)
        # Rows matched via only one or two indexes leave NaN in the other
        # distance columns; max(axis=1) skips NaN by default.
        similar_df["max_similarity"] = similar_df[
            ["source_distances", "selected_distances", "name_distances"]
        ].max(axis=1)
        return similar_df.sort_values(by="max_similarity", ascending=False)

@retry(tries=3)
def send_chatAPI(template: str, word: str):
    """Send a system template plus a user message to the chat model and
    return the assistant's reply text.

    Retried up to 3 times on any exception (e.g. timeouts: each request is
    capped at 7.5 seconds).
    """
    chat_messages = [
        {"role": "system", "content": template},
        {"role": "user", "content": word},
    ]
    # temperature=0 keeps the extraction deterministic across retries.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-1106",
        messages=chat_messages,
        temperature=0,
        request_timeout=7.5,
    )
    first_choice = response["choices"][0]
    return first_choice["message"]["content"]


def extract_required_skill(system_template, text):
    """Ask the chat model which skills *text* requires and normalize levels.

    The model is expected to answer one ``- <skill>,<level>`` pair per line.
    Levels are min-max normalized into (0, 1]: the offset ``min(levels) - 1``
    makes the denominator ``max - min + 1`` at least 1, so no division by
    zero and the lowest level still gets a positive weight.

    Parameters
    ----------
    system_template : str
        System prompt sent to the model (also printed for debugging).
    text : str
        User text to extract required skills from.

    Returns
    -------
    tuple[dict, dict]
        The same ``{skill: normalized_level}`` dict twice — kept for
        backward compatibility with existing callers.
    """
    print(system_template)
    label = send_chatAPI(system_template, text)
    print(label)
    skills, levels = [], []

    for line in label.split("\n"):
        if line == "":
            continue
        # Skip malformed lines instead of crashing on an unexpected model
        # response (original raised ValueError/IndexError here).
        parts = line.split(",")
        if len(parts) != 2:
            continue
        skill_part, level_part = parts
        try:
            level = int(level_part)
        except ValueError:
            continue
        name_parts = skill_part.split("- ")
        if len(name_parts) < 2:
            continue
        skills.append(name_parts[1])
        levels.append(level)

    if not levels:
        # Nothing parseable: return empty mappings rather than letting
        # max()/min() raise on an empty sequence.
        print({})
        return {}, {}

    max_level = max(levels)
    min_level = min(levels) - 1
    # Normalize the levels using both max and min values.
    label = {
        skill: (level - min_level) / (max_level - min_level)
        for skill, level in zip(skills, levels)
    }
    print(label)
    return label, label

def calculate_score(skill_finder, required_skill_dict):
    """Aggregate per-user match scores across all required skills.

    For each required skill, similar rows are fetched from *skill_finder*
    and each matched tag contributes ``max_similarity * skill_weight`` to
    the owning user's total — capped at two tags per user per required
    skill (the counter is incremented before the ``< 3`` check).

    Parameters
    ----------
    skill_finder : skillFinder
        Object exposing ``find_similar_skill(name) -> DataFrame`` with
        columns ``display_name``, ``name`` and ``max_similarity``.
    required_skill_dict : dict[str, float]
        Mapping of required skill name to its weight.

    Returns
    -------
    pandas.DataFrame
        Columns ``user``, ``total_score``, ``related_tags`` sorted by
        ``total_score`` descending; empty (but correctly typed) when
        nothing matched.
    """
    total_score_dict = {}
    for required_skill, score in required_skill_dict.items():
        user_count = {}
        similar_df = skill_finder.find_similar_skill(required_skill)
        similar_df = similar_df.drop_duplicates(subset=["name", "display_name"])
        for user, name, max_similarity in zip(
            similar_df["display_name"],
            similar_df["name"],
            similar_df["max_similarity"],
        ):
            user_count[user] = user_count.get(user, 0) + 1
            # At most two contributions per user per required skill.
            if user_count[user] < 3:
                user_scores = total_score_dict.setdefault(user, {})
                user_scores[name] = user_scores.get(name, 0) + max_similarity * score

    res = []
    for user, tag_score_dict in total_score_dict.items():
        total_score = sum(tag_score_dict.values())
        ranked = sorted(tag_score_dict.items(), key=lambda item: item[1], reverse=True)
        related_tags = "".join(f"{tag}({tag_score:.2f}), " for tag, tag_score in ranked)
        res.append(
            {"user": user, "total_score": total_score, "related_tags": related_tags}
        )

    if not res:
        # pd.DataFrame([]) has no columns, so sort_values(by="total_score")
        # would raise KeyError — return an empty, correctly-typed frame.
        return pd.DataFrame(columns=["user", "total_score", "related_tags"])
    return pd.DataFrame(res).sort_values(by="total_score", ascending=False)