m00913563 committed
Commit · b85afa8
1 Parent(s): 3755055
feat: add evaluator
Files changed:
- app.py (+7 -2)
- evaluator.py (+256 -0)
- models.py (+8 -1)
app.py
CHANGED
@@ -1,5 +1,5 @@
 from fastapi import FastAPI, HTTPException
-from models import CVExtracted,
+from models import CVExtracted, EvaModul, JobAndCV, ClassificationResult, InsertedLink
 import os
 from io import BytesIO
 # import extractor
@@ -8,6 +8,7 @@ from datetime import datetime
 from pypdf import PdfReader
 import requests
 import classificator
+import evaluator
 
 os.environ['TRANSFORMERS_CACHE'] = '/transformers_cache'
 os.environ['HF_HOME'] = '/transformers_cache'
@@ -86,4 +87,8 @@ async def extract(link: InsertedLink):
         raise HTTPException(status_code=response.status_code, detail="File server error")
 
     dictresult = extractor_llm.predict(text)
-    return dictresult
+    return dictresult
+
+@app.post("/eval", response_model=float)
+async def eval(eva: EvaModul):
+    return evaluator.evaluate_interview(competences=eva.competences, transcript=eva.transcript)
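For reference, a minimal sketch of how the new /eval endpoint might be called once the Space is running. The host URL and the nested transcript shape are assumptions, inferred from the EvaModul model and from how evaluator.evaluate_interview indexes the transcript, not something this commit documents:

import requests

# Hypothetical payload: one competence, one behavioral chat and one technical chat.
payload = {
    "competences": ["Python"],
    "transcript": {
        "behavioral": [[{"question": "Describe a project you are proud of.", "answer": "..."}]],
        "technical": [[{"question": "TECHNICAL: What does zip() do?", "answer": "..."}]],
    },
}

r = requests.post("http://localhost:7860/eval", json=payload)  # port 7860 is an assumption
print(r.json())  # a single float score between 0 and 100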
evaluator.py
ADDED
@@ -0,0 +1,256 @@
+from openai import OpenAI
+from models import Evaluation
+
+tags = {'AI': "This one is the competence description"}  # list of competences to match against; better to hit the DB.
+client = OpenAI()
+
+def generate_model_parameters(skill: str, transcript: str):
+    # Build a few-shot chat payload that asks the model to grade one technical answer.
+    model_parameters = {
+        "model": "gpt-4-0125-preview",
+        "messages": [
+            {"role": "system", "content": f"""
+You are tasked with evaluating a transcript of an IT job interview. The interview conducted in the transcript is technical.
+You need sufficient IT knowledge, since you will evaluate the interviewee's answers to determine whether they are correct or not.
+You will output "SUCCESS" if the interviewee's answer is deemed correct and "FAIL" if it is deemed incorrect.
+
+Here are 5 examples of correct answers:
+
+EXAMPLE 1:
+SKILL TO BE EVALUATED: Python
+
+INTERVIEWER:
+What is the use of zip() in Python?
+
+INTERVIEWEE:
+zip() returns an iterator and takes iterables as arguments. These iterables can be lists, tuples, dictionaries, etc. It maps the same index of every iterable to make a single entity.
+
+OUTPUT: SUCCESS
+
+EXAMPLE 2:
+SKILL TO BE EVALUATED: Python
+
+INTERVIEWER:
+What will be the output of the following?
+name=["swati","shweta"]
+age=[10,20]
+new_entity=zip(name,age)
+new_entity=set(new_entity)
+print(new_entity)
+
+INTERVIEWEE:
+The output is {{('shweta', 20), ('swati', 10)}}
+
+OUTPUT: SUCCESS
+
+EXAMPLE 3:
+SKILL TO BE EVALUATED: Python
+
+INTERVIEWER:
+What will be the output of the following?
+a=["1","2","3"]
+b=["a","b","c"]
+c=[x+y for x, y in zip(a,b)]
+print(c)
+
+INTERVIEWEE:
+The output is: ['1a', '2b', '3c']
+
+OUTPUT: SUCCESS
+
+EXAMPLE 4:
+SKILL TO BE EVALUATED: Python
+
+INTERVIEWER:
+What will be the output of the following?
+str="apple#banana#kiwi#orange"
+print(str.split("#",2))
+
+INTERVIEWEE:
+['apple', 'banana', 'kiwi#orange']
+
+OUTPUT: SUCCESS
+
+EXAMPLE 5:
+SKILL TO BE EVALUATED: Python
+
+INTERVIEWER:
+What are Python modules? Name some commonly used built-in modules in Python.
+
+INTERVIEWEE:
+Python modules are files containing Python code. This code can be functions, classes, or variables. A Python module is a .py file containing executable code. Some of the commonly used built-in modules are:
+- os
+- sys
+- math
+- random
+- datetime
+- json
+
+OUTPUT: SUCCESS
+
+Note that the examples given above all have correct answers. Your job is to generate the output only (SUCCESS or FAIL). You do not need to explain your justification.
+SKILL TO BE EVALUATED: {skill}
+{transcript}
+
+"""},
+        ]
+    }
+
+    return model_parameters
+
+def gpt_evaluator(payload, fewshot, response_format):
+    # Structured-output call: the few-shot examples go in as the system prompt,
+    # the evaluation payload as the user message.
+    if isinstance(payload, list):
+        payload = "\n\n".join(payload)
+    response = client.beta.chat.completions.parse(
+        model="gpt-4o",
+        messages=[
+            {"role": "system", "content": fewshot},
+            {"role": "user", "content": payload},
+        ],
+        response_format=response_format,
+    )
+    return response.choices[0].message.parsed
+
+def extract_competences_and_responses(competences: list[str], transcripts: list[list[dict]]):
+    # Concatenate the interviewee's answers of each behavioral chat, one string per competence.
+    responses = []
+
+    for i in range(len(competences)):
+        transcript = transcripts[i]
+
+        response = ""
+        for idx, chat in enumerate(transcript):
+            # logger.info(chat)
+            response += chat["answer"]
+
+            if idx < len(transcript) - 1:
+                response += "\n"
+
+        responses.append(response)
+
+    return responses
+
+def evaluate_interview(competences: list[str], transcript: dict):
+    global tags
+    model_inputs = []
+
+    responses = extract_competences_and_responses(competences, transcript["behavioral"])
+
+    print(len(competences))
+    print(len(responses))
+
+    # pprint(transcript)
+
+    for i in range(len(competences)):
+        competence = competences[i]
+        response = responses[i]
+
+        text = "KNOWLEDGE:\n"
+
+        matching_tags_text_competence = {tag for tag in tags if tag in competence}
+        matching_tags_text_response = {tag for tag in tags if tag in response}
+
+        matching_tags = matching_tags_text_competence.union(matching_tags_text_response)
+
+        knowledge_exist = False
+        for tag in matching_tags:
+            knowledge_text = tags[tag]
+            if "UNKNOWN TAG" not in knowledge_text:
+                text += knowledge_text
+                text += "\n"
+                knowledge_exist = True
+
+        if not knowledge_exist:
+            text += "None\n"
+
+        text += f"\nCOMPETENCE: {competence}\n\n"
+
+        text += f"RESPONSE:\n{response}"
+
+        model_inputs.append(text)
+        # print(text)
+        print("------")
+    ## TODO: change to gpt
+    result = gpt_evaluator(model_inputs,
+                           """
+Here is an example:
+
+EXAMPLE 1:
+SKILL TO BE EVALUATED: Honest
+
+INTERVIEWER:
+What are your nightmares?
+
+INTERVIEWEE:
+I Do not have night mare
+
+OUTPUT: FAIL
+
+Always send the output in the format "FAIL" or "SUCCESS".
+""",
+                           Evaluation
+                           )
+    ## output:
+    ## TODO: generate_behavioral_score still expects classifier-style
+    ## [{"label": ..., "score": ...}, ...] pairs; align it with the parsed Evaluation output.
+    final_score = 0
+    behavioral_scores = generate_behavioral_score(result)
+    technical_scores = generate_technical_score(competences, transcript["technical"])
+
+    final_score = aggregate_scores(behavioral_scores, technical_scores)
+
+    return final_score
+
+def aggregate_scores(b: list[int], t: list[int]):
+    # Average behavioral and technical scores per competence; a technical score of -1
+    # means no technical question was asked, so only the behavioral score counts.
+    total_score = 0
+
+    for i in range(len(b)):
+        score = 0
+        if t[i] != -1:
+            score = (b[i] + t[i]) / 2
+        else:
+            score = b[i]
+
+        total_score += score
+
+    return (total_score / len(b)) * 100
+
+def generate_behavioral_score(eval_array):
+    # Expects two {"label", "score"} candidates per evaluation; scores 1 when
+    # SUCCESS outweighs FAIL, else 0.
+    scores = []
+
+    for eval in eval_array:
+        fail_score = 0
+        success_score = 0
+
+        if eval[0]["label"] == "FAIL":
+            fail_score = eval[0]["score"]
+        elif eval[0]["label"] == "SUCCESS":
+            success_score = eval[0]["score"]
+
+        if eval[1]["label"] == "FAIL":
+            fail_score = eval[1]["score"]
+        elif eval[1]["label"] == "SUCCESS":
+            success_score = eval[1]["score"]
+
+        if fail_score < success_score:
+            scores.append(1)
+        else:
+            scores.append(0)
+
+    return scores
+
+def generate_technical_score(skills: list[str], transcript: list):
+    # total_score = 0
+    scores = []
+    for idx, skill in enumerate(skills):
+        chat = transcript[idx]
+        if len(chat) > 0:
+            # print(chat)
+            # The question comes from the interviewer, the answer from the interviewee;
+            # removeprefix drops the "TECHNICAL: " marker from the question.
+            transcript_text = f"INTERVIEWER:\n{chat[0]['question'].removeprefix('TECHNICAL: ')}\n\nINTERVIEWEE:\n{chat[0]['answer']}"
+            # TODO: change to structured output
+            model_parameters = generate_model_parameters(skill, transcript_text)
+            completion = client.chat.completions.create(
+                **model_parameters
+            )
+
+            generated = completion.choices[0].message.content
+            score = 1 if "SUCCESS" in generated else 0
+            # total_score += score
+            scores.append(score)
+        else:
+            scores.append(-1)
+
+    return scores
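As a sanity check on the aggregation logic: generate_technical_score returns -1 for a competence with no technical chat, and aggregate_scores then falls back to the behavioral score alone; otherwise it averages the two. A small illustrative example, with made-up scores:

from evaluator import aggregate_scores

b = [1, 0, 1]   # behavioral pass/fail per competence
t = [1, -1, 0]  # technical pass/fail; -1 marks "no technical question asked"
# per competence: (1 + 1) / 2 = 1.0, then 0 (behavioral only), then (1 + 0) / 2 = 0.5
# final score: (1.0 + 0 + 0.5) / 3 * 100 = 50.0
print(aggregate_scores(b, t))  # 50.0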
models.py
CHANGED
@@ -58,4 +58,11 @@ class ClassificationResult(BaseModel):
     score: float
 
 class InsertedLink(BaseModel):
-    link: str
+    link: str
+
+class Evaluation(BaseModel):
+    label: str
+
+class EvaModul(BaseModel):
+    competences: list[str]
+    transcript: dict