File size: 4,927 Bytes
40b3335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import json
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from difflib import SequenceMatcher
import numpy as np
import pandas as pd
from tqdm import tqdm


# === Configuration ===
# Reference data: JSON array of objects; ref_column is the field holding the
# reference text (filename suggests unwatermarked model outputs — confirm).
ref_data = "DeepSeek7b_No_WM_test_13860.json"
ref_column = "output_only" 

# Candidate data: JSON array of objects; cand_column is the field holding the
# candidate text (filename suggests Dipper-paraphrased outputs — confirm).
cand_data = "Dipper_DeepSeek_TW_13860.json"
cand_column ="paraphrased_response"

# Output CSV path for per-example metrics, and how many records to read
# from the head of each input file.
output_name = "Entity_Dipper_DeepSeek_TW.csv" 
N = 13860

# Use SPACY
# Load spaCy's small English pipeline; used for NER and for the token
# vectors consumed by compute_similarity.
# NOTE(review): en_core_web_sm ships without static word vectors, so
# `.vector` falls back to context-sensitive tensors — consider
# en_core_web_md/lg for more meaningful cosine similarity. Confirm intent.
nlp = spacy.load("en_core_web_sm")

# Function to extract named entities from text
def extract_named_entities(text):
    """Return the surface string of every named entity spaCy finds in *text*, in document order."""
    return [entity.text for entity in nlp(text).ents]


# === Similarity Calculation ===
def compute_similarity(entity1, entity2):
    """Score two entity strings.

    Returns a (combined, levenshtein, cosine) triple where:
      - levenshtein: difflib SequenceMatcher ratio over the raw strings,
      - cosine: cosine similarity of the spaCy document vectors (0.0 when
        either vector is all zeros, e.g. out-of-vocabulary text),
      - combined: unweighted mean of the two.
    Identical strings short-circuit to (1.0, 1.0, 1.0).
    """
    if entity1 == entity2:
        return 1.0, 1.0, 1.0

    lev = SequenceMatcher(None, entity1, entity2).ratio()

    vec_a = nlp(entity1).vector.reshape(1, -1)
    vec_b = nlp(entity2).vector.reshape(1, -1)

    # A zero vector would make cosine similarity undefined; treat it as 0.
    if np.any(vec_a) and np.any(vec_b):
        cos = cosine_similarity(vec_a, vec_b)[0][0]
    else:
        cos = 0.0

    return (lev + cos) / 2, lev, cos

# === Greedy Pairwise Matching ===
def greedy_pairwise_matching(ref_entities, cand_entities):
    """Greedily align reference entities to candidate entities.

    Each reference entity (in order) claims the still-unclaimed candidate
    with the highest combined similarity, provided that score is > 0;
    otherwise it is paired with the placeholder "MISSING". Candidates left
    unclaimed at the end are emitted as ("NEW ENTITY", candidate, 0, 0, 0).

    Returns a list of (ref, cand, combined, levenshtein, cosine) tuples.
    """
    matches = []
    pool = cand_entities.copy()

    for ref in ref_entities:
        winner = None
        winner_scores = (0, 0, 0)  # (combined, levenshtein, cosine)

        for cand in pool:
            scores = compute_similarity(ref, cand)
            # Strict > keeps the earliest candidate on ties, and leaves
            # zero-score candidates unmatched.
            if scores[0] > winner_scores[0]:
                winner, winner_scores = cand, scores

        if winner:
            matches.append((ref, winner) + winner_scores)
            pool.remove(winner)
        else:
            matches.append((ref, "MISSING", 0, 0, 0))

    # Whatever remains in the pool was never claimed by any reference entity.
    matches.extend(("NEW ENTITY", leftover, 0, 0, 0) for leftover in pool)

    return matches

# === Load Data ===
# Each input is a JSON array of objects; keep only the configured text field
# from the first N entries of each file.
with open(ref_data, "r", encoding="utf-8") as ref_file:
    reference_data = [entry[ref_column] for entry in json.load(ref_file)[:N]]

with open(cand_data, "r", encoding="utf-8") as cand_file:
    candidate_data = [entry[cand_column] for entry in json.load(cand_file)[:N]]

# The pipeline below zips the two lists pairwise, so the counts must agree.
# NOTE(review): `assert` is stripped under `python -O`; a raised ValueError
# would be a more robust guard.
assert len(reference_data) == len(candidate_data), "Mismatch in data point count!"


# === Process Each Pair ===
# For every (reference, candidate) text pair: extract named entities, greedily
# align them, and derive similarity / overlap statistics for one CSV row.
results = []

for idx, (ref_text, cand_text) in enumerate(tqdm(zip(reference_data, candidate_data), total=len(reference_data))):
    ref_entities = extract_named_entities(ref_text)
    cand_entities = extract_named_entities(cand_text)

    # Nothing to compare when neither side has entities; such pairs are
    # omitted from the output entirely (so Index values may have gaps... no,
    # idx still advances per pair, so Index stays the original pair index).
    if not ref_entities and not cand_entities:
        continue

    matched_entities = greedy_pairwise_matching(ref_entities, cand_entities)

    # Average each similarity over scored pairs only — zero-score rows are
    # "MISSING"/"NEW ENTITY" placeholders that would dilute the mean.
    cosine_similarities = [match[4] for match in matched_entities if match[4] > 0]
    levenshtein_similarities = [match[3] for match in matched_entities if match[3] > 0]

    avg_cosine_similarity = np.mean(cosine_similarities) if cosine_similarities else 0.0
    avg_levenshtein_similarity = np.mean(levenshtein_similarities) if levenshtein_similarities else 0.0
    avg_similarity = (avg_cosine_similarity + avg_levenshtein_similarity) / 2

    # Pairs whose surface strings are identical (exact entity overlap).
    exact_match_pairs = sum(1 for match in matched_entities if match[0] == match[1])

    # Jaccard-style denominator: |ref| + |cand| - |exact matches|, floored at
    # 1 to avoid division by zero.
    union_count = max(len(ref_entities) + len(cand_entities) - exact_match_pairs, 1)

    # Average similarity scaled by entity-count overlap.
    final_score = (avg_similarity / union_count) * max(len(ref_entities), len(cand_entities))

    # Column names fixed to consistent Snake_Case: "Candidate _Entities"
    # (stray space) -> "Candidate_Entities", "Exact_Match Pairs" ->
    # "Exact_Match_Pairs".
    results.append({
        "Index": idx,
        "Reference_Entity_Count": len(ref_entities),
        "Candidate_Entity_Count": len(cand_entities),
        "Reference_Entities": ref_entities,
        "Candidate_Entities": cand_entities,
        "Matched_Entities": matched_entities,
        "Exact_Match_Pairs": exact_match_pairs,
        "Union_Count": union_count,
        "Average_Cosine_Similarity": avg_cosine_similarity,
        "Average_Levenshtein_Similarity": avg_levenshtein_similarity,
        "Average_Combined_Similarity_Score": avg_similarity,
        "Final_Score": final_score
    })

# === Save to CSV ===
# Persist one row per processed pair, then report the corpus-level mean score.
results_df = pd.DataFrame(results)
results_df.to_csv(output_name, index=False)

mean_final = results_df["Final_Score"].mean()
print(f'Average Final Score: {mean_final}')