m00913563 commited on
Commit
b85afa8
·
1 Parent(s): 3755055

feat: add evaluator

Browse files
Files changed (3) hide show
  1. app.py +7 -2
  2. evaluator.py +256 -0
  3. models.py +8 -1
app.py CHANGED
@@ -1,5 +1,5 @@
1
  from fastapi import FastAPI, HTTPException
2
- from models import CVExtracted, InsertedText, JobAndCV, ClassificationResult, InsertedLink
3
  import os
4
  from io import BytesIO
5
  # import extractor
@@ -8,6 +8,7 @@ from datetime import datetime
8
  from pypdf import PdfReader
9
  import requests
10
  import classificator
 
11
 
12
  os.environ['TRANSFORMERS_CACHE'] = '/transformers_cache'
13
  os.environ['HF_HOME'] = '/transformers_cache'
@@ -86,4 +87,8 @@ async def extract(link: InsertedLink):
86
  raise HTTPException(status_code=response.status_code, detail="File server error")
87
 
88
  dictresult = extractor_llm.predict(text)
89
- return dictresult
 
 
 
 
 
1
  from fastapi import FastAPI, HTTPException
2
+ from models import CVExtracted, EvaModul, JobAndCV, ClassificationResult, InsertedLink
3
  import os
4
  from io import BytesIO
5
  # import extractor
 
8
  from pypdf import PdfReader
9
  import requests
10
  import classificator
11
+ import evaluator
12
 
13
  os.environ['TRANSFORMERS_CACHE'] = '/transformers_cache'
14
  os.environ['HF_HOME'] = '/transformers_cache'
 
87
  raise HTTPException(status_code=response.status_code, detail="File server error")
88
 
89
  dictresult = extractor_llm.predict(text)
90
+ return dictresult
91
+
92
@app.post("/eval", response_model=float)
async def eval(eva: EvaModul):  # NOTE(review): name shadows builtin `eval`; consider renaming
    """Score an interview transcript against the given competences.

    Delegates to evaluator.evaluate_interview and returns its aggregate
    score (a float; the evaluator scales it to 0-100).
    """
    return evaluator.evaluate_interview(competences=eva.competences, transcript=eva.transcript)
evaluator.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+ from models import Evaluation
3
+ tags = {'AI': "This one is the competence description"} #list of competence to save, better to hit db.
4
+ client = OpenAI()
5
+
6
def generate_model_parameters(skill: str, transcript: str):
    """Build the keyword-argument dict for a chat-completions call that grades
    one technical interview exchange.

    Args:
        skill: The competence being probed (e.g. "Python").
        transcript: Pre-formatted interviewer/interviewee text for one question.

    Returns:
        dict: kwargs (model name + messages) intended for
        ``client.chat.completions.create(**model_parameters)``; the system
        prompt instructs the model to answer only "SUCCESS" or "FAIL".
    """
    # Few-shot system prompt: five worked SUCCESS examples, with the real
    # skill and transcript appended at the end.  NOTE(review): the example
    # text is reproduced verbatim, typos included ("new_entity-zip",
    # "data time") — confirm intent before editing the prompt body.
    model_parameters = {
        "model":"gpt-4-0125-preview",
        "messages":[
            {"role": "system", "content": f"""
        You are tasked with evaluating a transcript of an IT job interview. The interview that is conducted in the transcript is technical.
        You need sufficient IT knowledge since you will evaluate the answer of the interviewee to determine whether the interviewee answer correctly or not.
        You will output "SUCCESS" if the interviewee's answer is deemed correct and "FAIL" if it's deemed false.
        Below are 5 examples of correct answers.

        Here are 5 examples:
        EXAMPLE 1:
        SKILL TO BE EVALUATED: Python

        INTERVIEWER:
        What is the use of zip () in python?

        INTERVIEWEE:
        The zip returns an iterator and takes iterable as argument. These iterables can be list, tuple, dictionary etc. It maps similar index of every iterable to make a single entity.

        OUTPUT: SUCCESS

        EXAMPLE 2:
        SKILL TO BE EVALUATED: Python

        INTERVIEWER:
        What will be the output of the following?
        name=["swati","shweta"]
        age=[10,20]
        new_entity-zip(name,age)
        new_entity-set(new_entity)
        print(new_entity)

        INTERVIEWEE:
        The output is {{('shweta', 20), ('swati', 10)}}

        OUTPUT: SUCCESS

        EXAMPLE 3:
        SKILL TO BE EVALUATED: Python

        INTERVIEWER:
        What will be the output of the following?
        a=["1","2","3"]
        b=["a","b","c"]
        c=[x+y for x, y in zip(a,b)] print(c)

        INTERVIEWEE:
        The output is: ['1a', '2b', '3c']

        OUTPUT: SUCCESS

        EXAMPLE 4:
        SKILL TO BE EVALUATED: Python

        INTERVIEWER:
        What will be the output of the following?
        str="apple#banana#kiwi#orange"
        print(str.split("#",2))

        INTERVIEWEE:
        ['apple', 'banana', 'kiwi#orange']

        OUTPUT: SUCCESS

        EXAMPLE 5:
        SKILL TO BE EVALUATED: Python

        INTERVIEWER:
        What are python modules? Name some commonly used built-in modules in Python?

        INTERVIEWEE:
        Python modules are files containing Python code. This code can either be function classes or variables. A Python module is a .py file containing executable code. Some of the commonly used built-in modules are:
        - os
        - sys
        - math
        - random
        - data time
        - json

        OUTPUT: SUCCESS

        Note that the examples that I give above have the correct answer. Your job is to generate the output only (SUCCESS OR FAIL). You don't need to explain your justification.
        SKILL TO BE EVALUATED: {skill}
        {transcript}

        """},
        ]
    }

    return model_parameters
97
+
98
def gpt_evaluator(payload, fewshot, response_format):
    """Run a structured-output evaluation over the prepared model inputs.

    Args:
        payload: User-turn content — a single string, or a list of
            per-competence input strings (joined with blank lines).
        fewshot: System-prompt text holding the few-shot grading examples.
        response_format: Structured-output schema passed through to
            ``client.beta.chat.completions.parse`` (e.g. ``[Evaluation]``).

    Returns:
        The parsed structured response of the first choice.
    """
    # Bug fix: the original sent the *builtin* ``input`` function object as
    # the user message instead of the ``payload`` argument.
    content = "\n\n".join(payload) if isinstance(payload, list) else payload
    response = client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": fewshot},
            {"role": "user", "content": content},
        ],
        response_format=response_format,
    )
    return response.choices[0].message.parsed
108
+
109
def extract_competences_and_responses(competences: list[str], transcripts: list[dict]):
    """Collect, per competence, the interviewee answers joined by newlines.

    One transcript (a list of chat turns, each a dict with an "answer" key)
    is consumed for every entry in ``competences``; consecutive answers are
    separated by a single newline.
    """
    return [
        "\n".join(turn["answer"] for turn in transcripts[idx])
        for idx in range(len(competences))
    ]
126
+
127
def evaluate_interview(competences: list[str], transcript: dict):
    """Score a full interview (behavioral + technical) for the given competences.

    Args:
        competences: Skill/competence names, one per interview section.
        transcript: Interview record; reads transcript["behavioral"] and
            transcript["technical"], each apparently a list of chat-turn
            lists aligned with ``competences``.

    Returns:
        Aggregate score scaled to 0-100 (see ``aggregate_scores``).
    """
    # NOTE(review): `global` is unnecessary — `tags` is only read, never
    # reassigned here.
    global tags
    model_inputs = []

    responses = extract_competences_and_responses(competences, transcript["behavioral"])

    # Debug output; consider switching to logging.
    print(len(competences))
    print(len(responses))

    # pprint(transcript)

    # Build one "KNOWLEDGE / COMPETENCE / RESPONSE" prompt per competence.
    for i in range(len(competences)):
        competence = competences[i]
        response = responses[i]

        text = "KNOWLEDGE:\n"

        # Substring match of every known tag against both the competence
        # name and the interviewee's combined response.
        matching_tags_text_competence = {tag for tag in tags if tag in competence}
        matching_tags_text_response = {tag for tag in tags if tag in response}

        matching_tags = matching_tags_text_competence.union(matching_tags_text_response)

        # Append each tag's knowledge text, skipping placeholder entries.
        knowledge_exist = False
        for tag in matching_tags:
            knowledge_text = tags[tag]
            if "UNKNOWN TAG" not in knowledge_text:
                text += knowledge_text
                text += "\n"
                knowledge_exist = True

        if not knowledge_exist:
            text +="None\n"

        text += f"\nCOMPETENCE: {competence}\n\n"

        text += f"RESPONSE:\n{response}"

        model_inputs.append(text)
        # print(text)
        print("------")
    ## TODO: change to gpt
    # Behavioral grading: one GPT call over all prompts with a FAIL few-shot
    # example and a structured [Evaluation] response format.
    result = gpt_evaluator(model_inputs,
                """
                Here are 5 examples:
                EXAMPLE 1:
                SKILL TO BE EVALUATED: Honest

                INTERVIEWER:
                What are your nightmare?

                INTERVIEWEE:
                I Do not have night mare
                OUTPUT: FAIL

                Always send output in format "FAIL" or "SUCCESS"
                """,
                [Evaluation]
                )
    ## output:
    final_score = 0
    # NOTE(review): generate_behavioral_score indexes result like a list of
    # label/score dict pairs, while gpt_evaluator returns a parsed structured
    # object — confirm these formats actually match.
    behavioral_scores = generate_behavioral_score(result)
    technical_scores = generate_technical_score(competences, transcript["technical"])

    final_score = aggregate_scores(behavioral_scores, technical_scores)

    return final_score
193
+
194
def aggregate_scores(b: list[int], t: list[int]) -> float:
    """Combine behavioral and technical scores into a 0-100 percentage.

    Args:
        b: Behavioral score per competence (0 or 1).
        t: Technical score per competence; -1 means no technical question was
            asked, so only the behavioral score counts for that competence.

    Returns:
        Mean per-competence score scaled to 0-100.  Returns 0.0 for empty
        input (the original raised ZeroDivisionError).
    """
    if not b:
        return 0.0
    total_score = 0.0
    for behavioral, technical in zip(b, t):
        # Average the two when a technical score exists; otherwise the
        # behavioral score stands alone.
        total_score += behavioral if technical == -1 else (behavioral + technical) / 2
    return (total_score / len(b)) * 100
209
+
210
+
211
def generate_behavioral_score(eval_array):
    """Convert pairwise FAIL/SUCCESS evaluations into binary scores.

    Each element of ``eval_array`` holds two entries (dicts with "label" and
    "score" keys).  A competence scores 1 when the SUCCESS confidence strictly
    exceeds the FAIL confidence, otherwise 0.
    """
    scores = []
    for pair in eval_array:
        fail_score = 0
        success_score = 0

        # Exactly the first two entries are considered, same as the original.
        for entry in (pair[0], pair[1]):
            label = entry["label"]
            if label == "FAIL":
                fail_score = entry["score"]
            elif label == "SUCCESS":
                success_score = entry["score"]

        scores.append(1 if fail_score < success_score else 0)

    return scores
234
+
235
def generate_technical_score(skills: list[str], transcript: list) -> list[int]:
    """Grade each skill's technical Q&A exchange with the chat model.

    Args:
        skills: Competences, aligned index-for-index with ``transcript``.
        transcript: Per-skill list of chat turns; each turn is a dict with
            "question" and "answer" keys.  An empty list means the skill had
            no technical question.

    Returns:
        One entry per skill: 1 (model said SUCCESS), 0 (FAIL/other), or -1
        (no technical question; consumed as such by ``aggregate_scores``).
    """
    scores = []
    for idx, skill in enumerate(skills):
        chat = transcript[idx]
        if not chat:
            # No technical exchange for this skill.
            scores.append(-1)
            continue

        # Bug fix: str.lstrip("TECHNICAL: ") strips a *character set*, not a
        # prefix, so questions starting with T/E/C/H/N/I/A/L were mangled
        # (e.g. "Explain..." -> "xplain...").  removeprefix strips exactly
        # the marker.
        question = chat[0]["question"].removeprefix("TECHNICAL: ")
        # Bug fix: the original put the question under INTERVIEWEE and the
        # answer under INTERVIEWER, the reverse of the few-shot examples in
        # generate_model_parameters.
        transcript_text = f"INTERVIEWER:\n{question}\n\nINTERVIEWEE:\n{chat[0]['answer']}"

        # TODO: change to structured output
        model_parameters = generate_model_parameters(skill, transcript_text)
        completion = client.chat.completions.create(
            **model_parameters
        )

        generated = completion.choices[0].message.content
        scores.append(1 if "SUCCESS" in generated else 0)

    return scores
models.py CHANGED
@@ -58,4 +58,11 @@ class ClassificationResult(BaseModel):
58
  score: float
59
 
60
  class InsertedLink(BaseModel):
61
- link: str
 
 
 
 
 
 
 
 
58
  score: float
59
 
60
  class InsertedLink(BaseModel):
61
+ link: str
62
+
63
class Evaluation(BaseModel):
    """Structured-output schema for the behavioral evaluator."""

    # Expected to be the literal string "SUCCESS" or "FAIL".
    label: str
65
+
66
class EvaModul(BaseModel):
    """Request body for the /eval endpoint."""

    # Competences/skills to be scored, one per interview section.
    competences: list[str]
    # NOTE(review): expected keys appear to be "behavioral" and "technical",
    # each a list of chat-turn lists aligned with `competences` — confirm
    # against the caller.
    transcript: dict