Spaces: Build error
Upload 3 files
- data_processing.py +4 -4
- evaluation.py +71 -34
data_processing.py
CHANGED
@@ -76,11 +76,11 @@ def create_faiss_index(dataset):
 
 def load_ragbench():
     global ragbench
-    if ragbench
+    if ragbench:
         return ragbench
-
-    '
-
+    datasets = ['covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa', 'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa',
+                'tatqa', 'techqa']
+    for dataset in datasets:
         ragbench[dataset] = load_dataset("rungalileo/ragbench", dataset)
     return ragbench
 
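Note on the fix: the old "if ragbench" was missing its colon, a Python syntax error consistent with the Space's "Build error" status, and the old body referenced "dataset" without ever defining it. The new version defines the twelve RAGBench subset names and caches everything in the module-level ragbench dict. A minimal usage sketch follows; the "from data_processing import load_ragbench" import and a "ragbench = {}" initialization at module scope are assumptions inferred from the file name and the global statement, not shown in this diff.

# Hypothetical driver for the fixed loader (module name assumed).
from data_processing import load_ragbench

ragbench = load_ragbench()     # first call downloads all 12 RAGBench subsets
covidqa = ragbench['covidqa']  # each entry is a datasets.DatasetDict with its splits
print(covidqa)
cached = load_ragbench()       # later calls return the cached dict via "if ragbench:"
assert cached is ragbench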
evaluation.py
CHANGED
@@ -11,40 +11,40 @@ global ground_truth_answer, ground_truth_metrics
 ground_truth_answer = ''
 ground_truth_metrics = {}
 
-def calculate_metrics(question, response, docs, time_taken):
-    data = load_ragbench()
-    retrieve_ground_truths(question, data)
-    # Predicted metrics
-    predicted_metrics = {
-        "ground_truth": ground_truth_answer,
-        "context_relevance": context_relevance(question, docs),
-        "context_utilization": context_utilization(response, docs),
-        "completeness": completeness(response, ground_truth_answer),
-        "adherence": adherence(response, docs),
-        "response_time" : time_taken
-    }
-    return predicted_metrics
-
-def retrieve_ground_truths(question,ragbench_set):
-    for dataset_name in ragbench_set.keys():
-        for split_name,instances in ragbench_set[dataset_name].items(): # Fixed: Removed extra '.' and corrected indentation
-            print(f"Processing {split_name} split")
-            for instance in instances: # Fixed: Corrected indentation
-                # Check if the question (data) matches the query
-                if instance['question'] == question:
-                    # If a match is found, retrieve id and response
-                    instance_id = instance['id']
-                    instance_response = instance['response']
-                    ground_truth_metrics = {
-                        "context_relevance": instance['relevance_score'],
-                        "context_utilization": instance['utilization_score'],
-                        "completeness": instance['completeness_score'],
-                        "adherence": instance['adherence_score']
-                    }
-                    ground_truth_answer = instance_response
-                    print(f"Match found in {split_name} split!")
-                    print(f"ID: {instance_id}, Response: {instance_response}")
-                    break # Exit after finding the first match (optional)
+# def calculate_metrics(question, response, docs, time_taken):
+#     data = load_ragbench()
+#     retrieve_ground_truths(question, data)
+#     # Predicted metrics
+#     predicted_metrics = {
+#         "ground_truth": ground_truth_answer,
+#         "context_relevance": context_relevance(question, docs),
+#         "context_utilization": context_utilization(response, docs),
+#         "completeness": completeness(response, ground_truth_answer),
+#         "adherence": adherence(response, docs),
+#         "response_time" : time_taken
+#     }
+#     return predicted_metrics
+
+# def retrieve_ground_truths(question,ragbench_set):
+#     for dataset_name in ragbench_set.keys():
+#         for split_name,instances in ragbench_set[dataset_name].items(): # Fixed: Removed extra '.' and corrected indentation
+#             print(f"Processing {split_name} split")
+#             for instance in instances: # Fixed: Corrected indentation
+#                 # Check if the question (data) matches the query
+#                 if instance['question'] == question:
+#                     # If a match is found, retrieve id and response
+#                     instance_id = instance['id']
+#                     instance_response = instance['response']
+#                     ground_truth_metrics = {
+#                         "context_relevance": instance['relevance_score'],
+#                         "context_utilization": instance['utilization_score'],
+#                         "completeness": instance['completeness_score'],
+#                         "adherence": instance['adherence_score']
+#                     }
+#                     ground_truth_answer = instance_response
+#                     print(f"Match found in {split_name} split!")
+#                     print(f"ID: {instance_id}, Response: {instance_response}")
+#                     break # Exit after finding the first match (optional)
 
 # Step 1: Helper function to compute cosine similarity
 def compute_cosine_similarity(text1, text2):
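Net effect of this hunk: the original calculate_metrics and retrieve_ground_truths are commented out at the top of the file and reinstated after compute_rmse (next hunk) with one behavioral fix worth calling out: retrieve_ground_truths now returns the matched response instead of assigning to ground_truth_answer. The old assignment never updated the module-level variable, since an assignment inside a function without a global declaration only binds a local name. A minimal self-contained sketch of that pitfall:

# Why the rewrite returns a value instead of assigning a global:
# assignment inside a function without `global` binds a new local name.
ground_truth_answer = ''

def set_without_global():
    ground_truth_answer = 'found'  # binds a local; the module-level value is untouched

set_without_global()
print(repr(ground_truth_answer))  # -> '' : still empty, hence the switch to return values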
@@ -91,4 +91,41 @@ def adherence(response, relevant_documents):
 def compute_rmse(predicted_values, ground_truth_values):
     return np.sqrt(mean_squared_error(ground_truth_values, predicted_values))
 
+def calculate_metrics(question, response, docs, time_taken):
+    data = load_ragbench()
+    ground_truth_answer = retrieve_ground_truths(question, data)  # Store the ground truth answer
+
+    # Ensure ground_truth_answer is not empty before proceeding
+    if ground_truth_answer is None:
+        ground_truth_answer = ""  # Default to an empty string if no ground truth is found
+
+    # Predicted metrics
+    predicted_metrics = {
+        "ground_truth": ground_truth_answer,
+        "context_relevance": context_relevance(question, docs),
+        "context_utilization": context_utilization(response, docs),
+        "completeness": completeness(response, ground_truth_answer),
+        "adherence": adherence(response, docs),
+        "response_time": time_taken
+    }
+    return predicted_metrics
+
+def retrieve_ground_truths(question, ragbench_set):
+    for dataset_name in ragbench_set.keys():
+        for split_name, instances in ragbench_set[dataset_name].items():
+            print(f"Processing {split_name} split")
+            for instance in instances:
+                if instance['question'] == question:
+                    instance_id = instance['id']
+                    instance_response = instance['response']
+                    # ground_truth_metrics = {
+                    #     "context_relevance": instance['relevance_score'],
+                    #     "context_utilization": instance['utilization_score'],
+                    #     "completeness": instance['completeness_score'],
+                    #     "adherence": instance['adherence_score']
+                    # }
+                    print(f"Match found in {split_name} split!")
+                    print(f"ID: {instance_id}, Response: {instance_response}")
+                    return instance_response  # Return ground truth response immediately
 
+    return None  # Return None if no match is found
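For completeness, a sketch of driving the reinstated calculate_metrics end to end. The module layout (evaluation.py exposing calculate_metrics alongside the metric helpers) matches the diff, but the question, docs, and response values are placeholders, not real pipeline output:

# Hypothetical end-to-end call; inputs are placeholders.
import time
from evaluation import calculate_metrics

question = "example question matching a RAGBench instance"
docs = ["retrieved context passage 1", "retrieved context passage 2"]

start = time.time()
response = "generated answer from the RAG pipeline"  # stand-in for the model call
time_taken = time.time() - start

metrics = calculate_metrics(question, response, docs, time_taken)
print(metrics)  # ground_truth, context_relevance, context_utilization, completeness, adherence, response_time

One caveat the diff leaves open: retrieve_ground_truths linearly scans every split of all twelve subsets on each call, so calculate_metrics is expensive per query; pre-building a question-to-response dict once would make the lookup constant-time.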