Muhammad Risqi Firdaus committed on
Commit
4f49d90
·
1 Parent(s): 21af640
Files changed (7) hide show
  1. .gitignore +6 -0
  2. app.py +2 -0
  3. env.example +2 -0
  4. evaluator.py +81 -46
  5. requirements.txt +2 -1
  6. temp.py +0 -6
  7. temp_eval.py +250 -0
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ venv
2
+ .env
3
+ .bash_history
4
+ .cache
5
+ __pycache__
6
+ transformers_cache
app.py CHANGED
@@ -90,6 +90,8 @@ async def extract(link: InsertedLink):
90
  dictresult = extractor_llm.predict(text)
91
  return dictresult
92
 
 
 
93
  @app.post("/eval", response_model=EvalResult)
94
  async def eval(eva: EvaModul):
95
  transcript = extractor_helper.extract_technical(eva.competences, eva.transcript)
 
90
  dictresult = extractor_llm.predict(text)
91
  return dictresult
92
 
93
+
94
+
95
  @app.post("/eval", response_model=EvalResult)
96
  async def eval(eva: EvaModul):
97
  transcript = extractor_helper.extract_technical(eva.competences, eva.transcript)
env.example ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ OPENAI_API_KEY=
2
+ HF_HOME=~/transformers_cache
evaluator.py CHANGED
@@ -195,15 +195,18 @@ def gpt_evaluator(payload, fewshot, response_format):
195
  print("-----tes")
196
  print(fewshot)
197
  print(payload)
198
- response = client.beta.chat.completions.parse(
199
- model="gpt-4o-2024-08-06",
200
- messages=[
201
- {"role": "system", "content": fewshot},
202
- {"role": "user", "content": str(payload)},
203
- ],
204
- response_format=response_format)
205
- json_str = response.choices[0].message.parsed
206
- return json_str
 
 
 
207
 
208
  def extract_competences_and_responses(competences: list[str], transcripts: list[dict]):
209
  responses = []
@@ -228,6 +231,10 @@ def evaluate_interview(competences: list[str], transcript: list, lang: str = 'en
228
  model_inputs = []
229
 
230
  responses = extract_competences_and_responses(transcript["comp_beha"], transcript["behavioral"])
 
 
 
 
231
  # pprint(transcript)
232
 
233
  for i in range(len(transcript["comp_beha"])):
@@ -236,74 +243,102 @@ def evaluate_interview(competences: list[str], transcript: list, lang: str = 'en
236
 
237
  text = "KNOWLEDGE:\n"
238
 
239
- # matching_tags_text_competence = {tag for tag in tags if tag in competence}
240
- # matching_tags_text_response = {tag for tag in tags if tag in response}
241
-
242
- # matching_tags = matching_tags_text_competence.union(matching_tags_text_response)
243
-
244
-
245
  knowledge_exist = False
246
- # for tag in matching_tags:
247
- # knowledge_text = tags[tag]
248
- # if "UNKNOWN TAG" not in knowledge_text:
249
- # text += knowledge_text
250
- # text += "\n"
251
- # knowledge_exist = True
252
-
253
- # if not knowledge_exist:
254
- # text +="None\n"
255
 
256
  text += f"\nCOMPETENCE: {competence}\n\n"
257
 
258
  text += f"RESPONSE:\n{response}"
259
 
260
  model_inputs.append(text)
261
- # print(text)
262
  print("------")
263
  ## TODO: change to gpt
264
 
265
- eng = """
266
- Here are 5 examples:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
  EXAMPLE 1:
268
  SKILL TO BE EVALUATED: Honest
269
-
270
  INTERVIEWER:
271
  What are your nightmare?
272
-
273
  INTERVIEWEE:
274
  I do not have night mare
275
-
276
  Judgement: It is impossible to some not having any nightmare. Scary of something is common human feels.
277
  Score: 0.1
278
-
 
 
 
 
 
 
 
 
 
 
279
  RETURN IN FORMAT BELOW:
280
  {
281
  value: [{
282
  "Judgement": "It is impossible to some not having any nightmare. Scary of something is common human feels. Means he was lying",
283
  "score": 0.1
284
- }]
 
 
 
 
 
285
  }
286
  """
287
- idn = """
288
- {
289
- "value": [{
290
- "Judgement": "It is impossible for someone to have never had any nightmare. Fear is a common human experience, so the interviewee is likely not being truthful.",
291
- "score": 0.1
292
- }]
293
- }
294
- """
295
- result = gpt_evaluator(model_inputs,
296
- eng if lang == 'en' else idn,
297
- Evaluations
298
  )
299
  ## output:
300
  final_score = 0
301
- behavioral_scores = generate_behavioral_score(result.value)
302
- technical_scores = generate_technical_score(transcript["comp_tech"], transcript["technical"], lang)
303
 
304
  final_score = aggregate_scores(behavioral_scores, technical_scores)
305
 
306
- return EvalResult(final_score=final_score, details=result.value)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
 
308
  def aggregate_scores(b: list[int], t: list[int]):
309
  total_score = 0
 
195
  print("-----tes")
196
  print(fewshot)
197
  print(payload)
198
+ res = []
199
+ for i in payload:
200
+ response = client.beta.chat.completions.parse(
201
+ model="gpt-4o-2024-08-06",
202
+ messages=[
203
+ {"role": "system", "content": fewshot},
204
+ {"role": "user", "content": (i)},
205
+ ],
206
+ response_format=response_format)
207
+ json_str = response.choices[0].message.parsed
208
+ res.append(json_str.value)
209
+ return res
210
 
211
  def extract_competences_and_responses(competences: list[str], transcripts: list[dict]):
212
  responses = []
 
231
  model_inputs = []
232
 
233
  responses = extract_competences_and_responses(transcript["comp_beha"], transcript["behavioral"])
234
+
235
+ print(len(competences))
236
+ print(len(responses))
237
+
238
  # pprint(transcript)
239
 
240
  for i in range(len(transcript["comp_beha"])):
 
243
 
244
  text = "KNOWLEDGE:\n"
245
 
 
 
 
 
 
 
246
  knowledge_exist = False
 
 
 
 
 
 
 
 
 
247
 
248
  text += f"\nCOMPETENCE: {competence}\n\n"
249
 
250
  text += f"RESPONSE:\n{response}"
251
 
252
  model_inputs.append(text)
 
253
  print("------")
254
  ## TODO: change to gpt
255
 
256
+ idn = """
257
+ CONTOH 1:
258
+ KETERAMPILAN YANG DINILAI: Kejujuran
259
+ PEWAWANCARA:
260
+ Apa mimpi burukmu?
261
+ PESERTA WAWANCARA:
262
+ Saya tidak punya mimpi buruk.
263
+ Penilaian: Tidak mungkin seseorang tidak pernah mengalami mimpi buruk. Rasa takut terhadap sesuatu adalah hal yang umum dirasakan manusia.
264
+ Skor: 0.1
265
+
266
+ CONTOH 2:
267
+ PEWAWANCARA:
268
+ Bisakah Anda menceritakan saat Anda harus men-debug masalah yang sangat sulit di lingkungan produksi?
269
+ PESERTA WAWANCARA:
270
+ Di pekerjaan saya sebelumnya, kami menggunakan arsitektur berbasis mikroservis yang dideploy di Kubernetes. Suatu pagi, kami mulai menerima peringatan bahwa layanan autentikasi pengguna kami gagal secara intermiten, dan pengguna tidak bisa masuk.
271
+ Sebagai engineer yang sedang bertugas, tanggung jawab saya adalah segera mengidentifikasi akar permasalahan dan mengembalikan layanan ke fungsionalitas penuh tanpa memengaruhi layanan lain yang bergantung padanya.
272
+ Saya mulai dengan memeriksa log di Kibana dan melihat bahwa beberapa pod untuk layanan autentikasi terus-menerus restart. Saya lalu memeriksa metrik penggunaan resource di Prometheus dan melihat lonjakan memori sebelum setiap crash. Saya curiga terjadi memory leak akibat perubahan terbaru, jadi saya rollback ke image container sebelumnya untuk menstabilkan layanan.
273
+ Setelah stabil, saya menelusuri commit terbaru dan menemukan penggunaan session store in-memory baru yang tidak melepaskan sesi lama dengan benar. Saya menulis skrip analisis heap dump cepat, mengonfirmasi kebocoran memori tersebut, dan memperbaiki session store dengan cache LRU yang terbatas.
274
+ Perbaikannya dideploy di hari yang sama, dan masalah tidak pernah terjadi lagi. Laporan postmortem yang saya tulis juga mendorong tim untuk mengadopsi profiling memori untuk semua komponen layanan baru. Waktu penyelesaian insiden kami meningkat sekitar 30% di kuartal berikutnya berkat perbaikan proses tersebut.
275
+ """
276
+
277
+ en = """
278
+ Here are 2 examples:
279
  EXAMPLE 1:
280
  SKILL TO BE EVALUATED: Honest
 
281
  INTERVIEWER:
282
  What are your nightmare?
 
283
  INTERVIEWEE:
284
  I do not have night mare
 
285
  Judgement: It is impossible to some not having any nightmare. Scary of something is common human feels.
286
  Score: 0.1
287
+
288
+ EXAMPLE 2:
289
+ INTERVIEWER:
290
+ Can you tell me about a time you had to debug a particularly difficult issue in a production environment?
291
+ INTERVIEWEE:
292
+ At my previous job, we had a microservices-based architecture deployed on Kubernetes. One morning, we started getting alerts that our user authentication service was intermittently failing, and users couldn’t log in.
293
+ As the engineer on call, my responsibility was to quickly identify the root cause and restore the service to full functionality without affecting other dependent services.
294
+ I began by checking the logs in Kibana and noticed that some of the pods for the authentication service were repeatedly restarting. I then checked the resource usage metrics in Prometheus and saw a memory spike before each crash. I suspected a memory leak introduced by a recent change, so I rolled back to the previous container image to stabilize the service.
295
+ After stabilizing, I dug deeper into the recent commits and found a new in-memory session store that was not properly releasing old sessions. I wrote a quick heap dump analysis script, confirmed the leak, and patched the session store to use a bounded LRU cache instead.
296
+ The fix was deployed the same day, and the issue never recurred. The postmortem I wrote also led to the team adopting memory profiling for all new service components. Our incident resolution time improved by about 30% over the next quarter due to those process improvements.
297
+
298
  RETURN IN FORMAT BELOW:
299
  {
300
  value: [{
301
  "Judgement": "It is impossible to some not having any nightmare. Scary of something is common human feels. Means he was lying",
302
  "score": 0.1
303
+ },
304
+ {
305
+ "Judgement: "The candidate delivered a clear, concise STAR response that effectively demonstrated strong technical skills, composure under pressure, and a methodical approach to problem-solving in a production environment. The use of appropriate tools (Kibana, Prometheus), the decision to roll back, and the successful root cause analysis showed depth of experience. The result was measurable and impactful, indicating not just resolution but long-term improvement. Slightly more context on user or business impact would make it perfect, but overall, this is an excellent response that would strongly support a hiring decision."
306
+ "score": 0.95
307
+ }
308
+ ]
309
  }
310
  """
311
+ result = gpt_evaluator(model_inputs, en if lang == 'en' else idn,
312
+ Evaluations
 
 
 
 
 
 
 
 
 
313
  )
314
  ## output:
315
  final_score = 0
316
+ behavioral_scores = generate_behavioral_score(result)
317
+ technical_scores = generate_technical_score(transcript["comp_tech"], transcript["technical"])
318
 
319
  final_score = aggregate_scores(behavioral_scores, technical_scores)
320
 
321
+ return EvalResult(final_score=final_score, details=result)
322
+
323
def aggregate_scores(b: list[int], t: list[int]) -> float:
    """Average behavioral and technical scores onto a 0-100 scale.

    Args:
        b: Behavioral scores (floats in [0, 1] from the LLM judge).
        t: Technical scores (1/0 pass-fail, or -1 for unanswered questions).

    Returns:
        The mean of all scores multiplied by 100.

    Raises:
        ZeroDivisionError: if both lists are empty.
    """
    # NOTE(review): an older `def aggregate_scores` still appears later in
    # this module and will shadow this definition at import time — confirm
    # and delete the stale copy.
    combined = b + t
    # sum() replaces the manual index loop over range(len(...)).
    return (sum(combined) / len(combined)) * 100
332
+
333
+
334
def generate_behavioral_score(eval_array):
    """Pull the numeric score out of each behavioral evaluation object.

    Args:
        eval_array: Iterable of parsed evaluation objects, each exposing a
            ``.score`` attribute.

    Returns:
        list of scores in input order.
    """
    # Comprehension replaces the append loop; the loop variable is renamed so
    # it no longer shadows the builtin ``eval``, and the stray debug print of
    # the whole array is dropped.
    return [evaluation.score for evaluation in eval_array]
342
 
343
  def aggregate_scores(b: list[int], t: list[int]):
344
  total_score = 0
requirements.txt CHANGED
@@ -8,4 +8,5 @@ scikit-learn
8
  numpy
9
  pandas
10
  openai
11
- geopy
 
 
8
  numpy
9
  pandas
10
  openai
11
+ geopy
12
+
temp.py DELETED
@@ -1,6 +0,0 @@
1
- from huggingface_hub import snapshot_download
2
- snapshot_download(
3
- repo_id="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
4
- cache_dir="/models/paraphrase-multilingual-mpnet-base-v2",
5
- local_files_only=False
6
- )
 
 
 
 
 
 
 
temp_eval.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+ from models import Evaluations,EvalResult
3
+ from typing import List, Dict
4
+ import json
5
+ tags = {'AI': "This one is the competence description"} #list of competence to save, better to hit db.
6
+ client = OpenAI()
7
+
8
def generate_model_parameters(skill: str, transcript: str) -> dict:
    """Build the chat-completion kwargs for grading one technical Q&A.

    The system prompt embeds five few-shot examples (all Python-themed) and
    instructs the model to answer with only "SUCCESS" or "FAIL".

    Args:
        skill: The technical skill being evaluated; interpolated into the prompt.
        transcript: Pre-formatted Q&A text for a single interview question.

    Returns:
        dict suitable for ``client.chat.completions.create(**params)``;
        pins the model to "gpt-4-0125-preview".
    """
    # NOTE(review): the few-shot examples are Python-only even though `skill`
    # can be any technology — confirm the judge generalizes as intended.
    model_parameters = {
        "model":"gpt-4-0125-preview",
        "messages":[
            {"role": "system", "content": f"""
            You are tasked with evaluating a transcript of an IT job interview. The interview that is conducted in the transcript is technical.
            You need sufficient IT knowledge since you will evaluate the answer of the interviewee to determine whether the interviewee answer correctly or not.
            You will output "SUCCESS" if the interviewee's answer is deemed correct and "FAIL" if it's deemed false.
            Below are 5 examples of correct answers.

            Here are 5 examples:
            EXAMPLE 1:
            SKILL TO BE EVALUATED: Python
            INTERVIEWER:
            What is the use of zip () in python?
            INTERVIEWEE:
            The zip returns an iterator and takes iterable as argument. These iterables can be list, tuple, dictionary etc. It maps similar index of every iterable to make a single entity.

            OUTPUT: SUCCESS
            EXAMPLE 2:
            SKILL TO BE EVALUATED: Python
            INTERVIEWER:
            What will be the output of the following?
            name=["swati","shweta"]
            age=[10,20]
            new_entity-zip(name,age)
            new_entity-set(new_entity)
            print(new_entity)
            INTERVIEWEE:
            The output is {{('shweta', 20), ('swati', 10)}}
            OUTPUT: SUCCESS
            EXAMPLE 3:
            SKILL TO BE EVALUATED: Python
            INTERVIEWER:
            What will be the output of the following?
            a=["1","2","3"]
            b=["a","b","c"]
            c=[x+y for x, y in zip(a,b)] print(c)
            INTERVIEWEE:
            The output is: ['1a', '2b', '3c']
            OUTPUT: SUCCESS
            EXAMPLE 4:
            SKILL TO BE EVALUATED: Python
            INTERVIEWER:
            What will be the output of the following?
            str="apple#banana#kiwi#orange"
            print(str.split("#",2))
            INTERVIEWEE:
            ['apple', 'banana', 'kiwi#orange']
            OUTPUT: SUCCESS
            EXAMPLE 5:
            SKILL TO BE EVALUATED: Python
            INTERVIEWER:
            What are python modules? Name some commonly used built-in modules in Python?
            INTERVIEWEE:
            Python modules are files containing Python code. This code can either be function classes or variables. A Python module is a .py file containing executable code. Some of the commonly used built-in modules are:
            - os
            - sys
            - math
            - random
            - data time
            - json
            OUTPUT: SUCCESS
            Note that the examples that I give above have the correct answer. Your job is to generate the output only (SUCCESS OR FAIL). You don't need to explain your justification.
            SKILL TO BE EVALUATED: {skill}
            {transcript}
            """},
        ]
    }

    return model_parameters
79
+
80
def gpt_evaluator(payload, fewshot, response_format):
    """Run each prepared prompt through the OpenAI structured-output API.

    Args:
        payload: Iterable of prompt strings, one per competence.
        fewshot: Few-shot system preamble shared by every call.
        response_format: Schema class handed to ``parse`` so the SDK returns
            structured objects.

    Returns:
        list with each response's parsed ``.value`` field, in input order.
    """
    print("-----tes")
    print(fewshot)
    print(payload)

    def _evaluate_one(prompt):
        # One API call per prompt; the SDK parses the reply into `response_format`.
        completion = client.beta.chat.completions.parse(
            model="gpt-4o-2024-08-06",
            messages=[
                {"role": "system", "content": fewshot},
                {"role": "user", "content": prompt},
            ],
            response_format=response_format,
        )
        return completion.choices[0].message.parsed.value

    return [_evaluate_one(prompt) for prompt in payload]
+
97
def extract_competences_and_responses(competences: list[str], transcripts: list[dict]) -> list[str]:
    """Collapse each competence's chat history into one newline-joined string.

    Args:
        competences: Competence names; only its length is used — exactly the
            first ``len(competences)`` entries of ``transcripts`` are processed.
        transcripts: One chat list per competence; each chat entry is a dict
            with at least an "answer" key.

    Returns:
        One concatenated answer string per competence, in order.
    """
    # str.join replaces the manual append-"\n"-except-on-last-item loop.
    return [
        "\n".join(chat["answer"] for chat in transcript)
        for transcript in transcripts[:len(competences)]
    ]
114
+
115
def evaluate_interview(competences: list[str], transcript: list, lang: str = 'en'):
    """Score a full interview and return the aggregate result.

    Pipeline: collapse each behavioral chat into one response string, build a
    prompt per behavioral competence, grade them via ``gpt_evaluator`` with a
    language-specific few-shot preamble, grade the technical answers via
    ``generate_technical_score``, then average everything with
    ``aggregate_scores``.

    Args:
        competences: Behavioral competence names; only printed for a debug
            length check here — the loop reads transcript["comp_beha"] instead.
        transcript: Used as a mapping with keys "comp_beha", "behavioral",
            "comp_tech" and "technical" despite the ``list`` annotation.
            NOTE(review): annotation looks wrong — confirm intended type.
        lang: 'en' selects the English few-shot examples; any other value
            selects the Indonesian ones.

    Returns:
        EvalResult carrying the 0-100 ``final_score`` and the per-competence
        evaluation details returned by the model.
    """
    # global tags
    model_inputs = []

    responses = extract_competences_and_responses(transcript["comp_beha"], transcript["behavioral"])

    # Debug sanity check that each competence got a collapsed response.
    print(len(competences))
    print(len(responses))

    # pprint(transcript)

    # Build one evaluation prompt per behavioral competence.
    for i in range(len(transcript["comp_beha"])):
        competence = transcript["comp_beha"][i]
        response = responses[i]

        # NOTE(review): the KNOWLEDGE section is always empty now — the
        # tag-matching code that used to fill it was removed; confirm the
        # header should stay in the prompt.
        text = "KNOWLEDGE:\n"

        knowledge_exist = False  # leftover from the removed knowledge lookup; unused

        text += f"\nCOMPETENCE: {competence}\n\n"

        text += f"RESPONSE:\n{response}"

        model_inputs.append(text)
    print("------")
    ## TODO: change to gpt

    # Indonesian few-shot preamble (two worked examples).
    idn = """
    CONTOH 1:
    KETERAMPILAN YANG DINILAI: Kejujuran
    PEWAWANCARA:
    Apa mimpi burukmu?
    PESERTA WAWANCARA:
    Saya tidak punya mimpi buruk.
    Penilaian: Tidak mungkin seseorang tidak pernah mengalami mimpi buruk. Rasa takut terhadap sesuatu adalah hal yang umum dirasakan manusia.
    Skor: 0.1

    CONTOH 2:
    PEWAWANCARA:
    Bisakah Anda menceritakan saat Anda harus men-debug masalah yang sangat sulit di lingkungan produksi?
    PESERTA WAWANCARA:
    Di pekerjaan saya sebelumnya, kami menggunakan arsitektur berbasis mikroservis yang dideploy di Kubernetes. Suatu pagi, kami mulai menerima peringatan bahwa layanan autentikasi pengguna kami gagal secara intermiten, dan pengguna tidak bisa masuk.
    Sebagai engineer yang sedang bertugas, tanggung jawab saya adalah segera mengidentifikasi akar permasalahan dan mengembalikan layanan ke fungsionalitas penuh tanpa memengaruhi layanan lain yang bergantung padanya.
    Saya mulai dengan memeriksa log di Kibana dan melihat bahwa beberapa pod untuk layanan autentikasi terus-menerus restart. Saya lalu memeriksa metrik penggunaan resource di Prometheus dan melihat lonjakan memori sebelum setiap crash. Saya curiga terjadi memory leak akibat perubahan terbaru, jadi saya rollback ke image container sebelumnya untuk menstabilkan layanan.
    Setelah stabil, saya menelusuri commit terbaru dan menemukan penggunaan session store in-memory baru yang tidak melepaskan sesi lama dengan benar. Saya menulis skrip analisis heap dump cepat, mengonfirmasi kebocoran memori tersebut, dan memperbaiki session store dengan cache LRU yang terbatas.
    Perbaikannya dideploy di hari yang sama, dan masalah tidak pernah terjadi lagi. Laporan postmortem yang saya tulis juga mendorong tim untuk mengadopsi profiling memori untuk semua komponen layanan baru. Waktu penyelesaian insiden kami meningkat sekitar 30% di kuartal berikutnya berkat perbaikan proses tersebut.
    """

    # English few-shot preamble (two worked examples plus the expected JSON
    # shape). NOTE(review): the second JSON example below is malformed
    # ("Judgement: quoting and a missing comma before "score") — confirm the
    # model still follows the intended format.
    en = """
    Here are 2 examples:
    EXAMPLE 1:
    SKILL TO BE EVALUATED: Honest
    INTERVIEWER:
    What are your nightmare?
    INTERVIEWEE:
    I do not have night mare
    Judgement: It is impossible to some not having any nightmare. Scary of something is common human feels.
    Score: 0.1

    EXAMPLE 2:
    INTERVIEWER:
    Can you tell me about a time you had to debug a particularly difficult issue in a production environment?
    INTERVIEWEE:
    At my previous job, we had a microservices-based architecture deployed on Kubernetes. One morning, we started getting alerts that our user authentication service was intermittently failing, and users couldn’t log in.
    As the engineer on call, my responsibility was to quickly identify the root cause and restore the service to full functionality without affecting other dependent services.
    I began by checking the logs in Kibana and noticed that some of the pods for the authentication service were repeatedly restarting. I then checked the resource usage metrics in Prometheus and saw a memory spike before each crash. I suspected a memory leak introduced by a recent change, so I rolled back to the previous container image to stabilize the service.
    After stabilizing, I dug deeper into the recent commits and found a new in-memory session store that was not properly releasing old sessions. I wrote a quick heap dump analysis script, confirmed the leak, and patched the session store to use a bounded LRU cache instead.
    The fix was deployed the same day, and the issue never recurred. The postmortem I wrote also led to the team adopting memory profiling for all new service components. Our incident resolution time improved by about 30% over the next quarter due to those process improvements.

    RETURN IN FORMAT BELOW:
    {
    value: [{
    "Judgement": "It is impossible to some not having any nightmare. Scary of something is common human feels. Means he was lying",
    "score": 0.1
    },
    {
    "Judgement: "The candidate delivered a clear, concise STAR response that effectively demonstrated strong technical skills, composure under pressure, and a methodical approach to problem-solving in a production environment. The use of appropriate tools (Kibana, Prometheus), the decision to roll back, and the successful root cause analysis showed depth of experience. The result was measurable and impactful, indicating not just resolution but long-term improvement. Slightly more context on user or business impact would make it perfect, but overall, this is an excellent response that would strongly support a hiring decision."
    "score": 0.95
    }
    ]
    }
    """
    result = gpt_evaluator(model_inputs, en if lang == 'en' else idn,
                           Evaluations
                           )
    ## output:
    final_score = 0
    behavioral_scores = generate_behavioral_score(result)
    technical_scores = generate_technical_score(transcript["comp_tech"], transcript["technical"])

    final_score = aggregate_scores(behavioral_scores, technical_scores)

    return EvalResult(final_score=final_score, details=result)
208
+
209
def aggregate_scores(b: list[int], t: list[int]) -> float:
    """Average behavioral and technical scores onto a 0-100 scale.

    Args:
        b: Behavioral scores (floats in [0, 1] from the LLM judge).
        t: Technical scores (1/0 pass-fail, or -1 for unanswered questions).

    Returns:
        The mean of ALL scores multiplied by 100.

    Raises:
        ZeroDivisionError: if both lists are empty.
    """
    alls = b + t
    # Bug fix: divide by the combined count, not len(b) — dividing by the
    # behavioral count alone inflates the mean whenever technical scores
    # exist (the evaluator.py copy of this function already uses len(alls)).
    return (sum(alls) / len(alls)) * 100
218
+
219
+
220
def generate_behavioral_score(eval_array):
    """Pull the numeric score out of each behavioral evaluation object.

    Args:
        eval_array: Iterable of parsed evaluation objects, each exposing a
            ``.score`` attribute.

    Returns:
        list of scores in input order.
    """
    # Comprehension replaces the append loop; the loop variable is renamed so
    # it no longer shadows the builtin ``eval``, and the stray debug print of
    # the whole array is dropped.
    return [evaluation.score for evaluation in eval_array]
228
+
229
def generate_technical_score(skills: list[str], transcript: list):
    """Grade each technical Q&A pair with a SUCCESS/FAIL LLM judgment.

    Args:
        skills: One technical skill per question (annotation fixed: this value
            is iterated and indexed, so it is a list, not a str).
        transcript: One chat list per skill; chat[0] holds 'question' and
            'answer'. An empty chat means the question went unanswered.

    Returns:
        list of 1 (judged SUCCESS), 0 (judged FAIL), or -1 (no answer given).
    """
    scores = []
    for idx, skill in enumerate(skills):
        chat = transcript[idx]
        if not chat:
            # No recorded exchange for this skill — sentinel, not a zero.
            scores.append(-1)
            continue
        # Bug fix: lstrip('TECHNICAL: ') strips any of those *characters* and
        # would mangle questions starting with T/E/C/H/...; removeprefix
        # strips the literal label only.
        question = chat[0]['question'].removeprefix('TECHNICAL: ')
        # Bug fix: labels were swapped — the few-shot prompt in
        # generate_model_parameters puts the question under INTERVIEWER and
        # the answer under INTERVIEWEE.
        transcript_text = f"INTERVIEWER:\n{question}\n\nINTERVIEWEE:\n{chat[0]['answer']}"
        # TODO: change to structured output
        model_parameters = generate_model_parameters(skill, transcript_text)
        completion = client.chat.completions.create(
            **model_parameters
        )
        generated = completion.choices[0].message.content
        scores.append(1 if "SUCCESS" in generated else 0)
    return scores