# 49988f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import json
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util

# from mydifflib import get_close_matches

# Pick the best available accelerator: CUDA first, then Apple's Metal
# backend (MPS), falling back to CPU.
if torch.cuda.is_available():
    _device_name = "cuda"
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    _device_name = "mps"
else:
    _device_name = "cpu"
device = torch.device(_device_name)
print(f"Device: {device}")

emb_model = SentenceTransformer('stsb-roberta-large', device=device)

def eval_sample(id, sample, choice, scores, questions, answers, answer_dne, temp_choice_list, threshold=0.85):
    """Assemble the evaluation record for one interactive-system sample.

    Encodes the questions, the sample's facts, and the answers' atomic
    sentences with the module-level ``emb_model``, then builds an output
    dict combining the raw interaction data with per-question scores.

    NOTE(review): the score computation is still a placeholder (random
    values), so the precomputed embeddings and ``threshold`` are not
    consumed yet — they are kept for the real metric implementation.

    Args:
        id: Sample identifier. (Shadows the ``id`` builtin; the name is
            kept unchanged for backward compatibility with callers.)
        sample: Dict with at least a "facts" list — TODO confirm schema
            against the caller.
        choice: Final choice produced by the interactive system.
        scores: Confidence scores reported by the system.
        questions: List of question strings asked by the system.
        answers: List of answer strings; each is split on '. ' into
            atomic facts, dropping purely numeric fragments.
        answer_dne: "answer does not exist" data — presumably one entry
            per question; verify against the caller.
        temp_choice_list: Intermediate choices across turns.
        threshold: Similarity threshold; currently unused by the
            placeholder metrics.

    Returns:
        dict with "id", "info", "interactive_system" and "eval" sections;
        each "eval" list holds one (placeholder) score per question.
    """
    # Embeddings are precomputed once per call because encoding is the
    # expensive step; they will feed the real similarity metrics.
    questions_emb = emb_model.encode(questions)
    facts_emb = emb_model.encode(sample["facts"])
    facts_count = [0] * len(sample["facts"])

    # Split every answer into atomic sentences, skipping fragments that
    # are purely numeric (e.g. leftover list numbering).
    answers_expanded, answers_count = [], []
    for answer in answers:
        atoms = [a for a in answer.split('. ') if not a.isnumeric()]
        answers_expanded.extend(atoms)
        answers_count.append(len(atoms))
    answers_emb = emb_model.encode(answers_expanded)

    n_questions = len(questions)
    output_dict = {
        "id": id,
        "info": sample,
        "interactive_system": {
            "choice": choice,
            "confidence_scores": scores,
            "questions": questions,
            "answers": answers,
            "answer_dne": answer_dne,
            "num_questions": n_questions,
            "intermediate_choices": temp_choice_list,
        },
        # Placeholder metrics: one random score per question for each
        # metric until the real computations are implemented.
        "eval": {
            metric: [np.random.random() for _ in range(n_questions)]
            for metric in (
                "repeat_question_score",
                "repeat_answer_score",
                "relevancy_score",
                "delta_confidence_score",
                "specificity_score",
            )
        },
    }

    return output_dict

# Other functions should be similarly reviewed and implemented