File size: 8,054 Bytes
2f4f4ef
 
 
 
 
 
 
 
 
ccada5d
2f4f4ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66b3adb
 
2f4f4ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ccada5d
 
 
 
8d43ad3
26774a0
 
 
 
 
 
 
 
 
 
 
c56bd4a
26774a0
 
ccada5d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import gradio as gr
import pandas as pd
import re
import numpy as np
import importlib
from pandas import json_normalize
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from pandas import json_normalize

### Parameters not expected to be changed in every run
# These defaults mirror the columns_embeddings_col1/col2 parameters of
# process_similarity_results below.

# columns to use for embeddings on table 1

columns_embeddings_col1 = ['Indicator Name']

# columns to use for embeddings on table 2
columns_embeddings_col2 = ['Indicator name (leonardo)']


#### Functions

# NOTE(review): mid-file import — conventionally this belongs at the top of
# the file with the other imports.
from numpy.linalg import norm

print("Functions loaded")

# Define cosine similarity function
def cos_sim(a, b):
    """Cosine similarity between vectors or between the rows of matrices.

    Parameters
    ----------
    a, b : array-like (numpy arrays or torch tensors)
        Either two 1-D vectors, or 2-D matrices of row vectors.

    Returns
    -------
    float or numpy.ndarray
        A scalar for two 1-D inputs; otherwise an (n_rows_a, n_rows_b)
        matrix of pairwise row cosine similarities.

    The previous lambda divided the whole product matrix by the *Frobenius*
    norms ``norm(a) * norm(b)``, which is only correct for 1-D vectors; for
    matrices it uniformly mis-scaled every entry.  Per-row norms fix that.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    if a.ndim == 1 and b.ndim == 1:
        # Plain vector-vector cosine (identical to the original behavior).
        return (a @ b) / (norm(a) * norm(b))
    a2 = np.atleast_2d(a)
    b2 = np.atleast_2d(b)
    # Row-wise norms broadcast across the product matrix.
    return (a2 @ b2.T) / (norm(a2, axis=1, keepdims=True) * norm(b2, axis=1, keepdims=True).T)

def concatenate_columns(df, columns):
    """Add a 'concatenated_input' column joining ``columns`` with a period.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place (a new 'concatenated_input' column is added).
    columns : list[str]
        Columns to concatenate, in order.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) ``df``, for chaining.

    Raises
    ------
    ValueError
        If any requested column is absent — the message now names the
        missing columns instead of just saying "one or more".
    """
    missing = [col for col in columns if col not in df.columns]
    if missing:
        raise ValueError(
            f"One or more specified columns do not exist in the DataFrame: {missing}"
        )

    # Concatenate the specified columns with a period as the separator
    df['concatenated_input'] = df[columns].astype(str).agg('.'.join, axis=1)
    return df


# Mean pooling: average the token embeddings of each sentence, counting
# only positions the attention mask marks as real tokens.
def mean_pooling(model_output, attention_mask):
    """Return per-sentence embeddings by mask-weighted averaging.

    Parameters
    ----------
    model_output : sequence
        Model output whose first element is the last hidden state,
        shape (batch, seq_len, hidden).
    attention_mask : torch.Tensor
        Shape (batch, seq_len); 1 for real tokens, 0 for padding.

    Returns
    -------
    torch.Tensor
        Shape (batch, hidden) mean-pooled embeddings.
    """
    last_hidden = model_output[0]
    # Broadcast the mask over the hidden dimension.
    mask = attention_mask.unsqueeze(-1).expand(last_hidden.size()).float()
    masked_sum = (last_hidden * mask).sum(dim=1)
    # Clamp avoids division by zero for an (unlikely) all-padding row.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_counts

# Define your get_embbedings function
def get_embbedings(table, colname):
    """Embed the strings in ``table[colname]`` with all-MiniLM-L6-v2.

    Parameters
    ----------
    table : pandas.DataFrame
    colname : str
        Column containing one string per row.

    Returns
    -------
    torch.Tensor
        L2-normalized sentence embeddings, shape (len(table), hidden_size),
        so dot products between rows equal cosine similarities.
    """
    # Load the HuggingFace tokenizer/model once and memoize them on the
    # function object; the original re-instantiated (and potentially
    # re-downloaded) them on every call.
    if not hasattr(get_embbedings, "_hf_cache"):
        tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
        model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
        model.eval()  # inference mode: disables dropout
        get_embbedings._hf_cache = (tokenizer, model)
    tokenizer, model = get_embbedings._hf_cache

    # Tokenize all rows as one padded, truncated batch
    encoded_input = tokenizer(table[colname].tolist(), padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings without tracking gradients
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Mask-aware mean pooling over the token dimension
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings to unit length
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    return sentence_embeddings


# Process similarity

def process_similarity_results(table1, table2,columns_embeddings_col1,columns_embeddings_col2, harmonization=True):
    """Match each table1 indicator to its most similar table2 indicators.

    Embeds the concatenation of the configured text columns of both tables,
    computes pairwise cosine similarities, and returns one row per table1
    indicator with a min-max-normalized best-match score and the names/IDs/
    frameworks of its top-5 table2 matches.

    Parameters
    ----------
    table1, table2 : pandas.DataFrame
        Each must have either an 'Indicator ID' or an 'ID' column, plus a
        'Framework' column (comma-separated framework names).  table1 must
        have 'Indicator Name'; table2 must have 'Indicator name (leonardo)'.
    columns_embeddings_col1, columns_embeddings_col2 : list[str]
        Columns whose period-joined concatenation is embedded for
        table1 / table2 respectively.
    harmonization : bool
        When True, similarities between indicator pairs that share ANY
        framework are blanked to NaN, so matches only cross frameworks.

    Returns
    -------
    pandas.DataFrame
        Columns: ID, Indicator Name, Framework, max_sim_normalized,
        top{1..5}name, top{1..5}id, top{1..5}framework.

    NOTE(review): mutates ``table1`` and ``table2`` in place (adds 'ID' and
    'concatenated_input' columns) — pass copies if the caller reuses them.
    """

    # Normalize the ID column: prefer 'Indicator ID', fall back to 'ID';
    # stringified so it can serve as row/column labels below.
    if 'Indicator ID' in table1.columns:
        table1['ID'] = table1['Indicator ID'].astype(str)
    else:
        table1['ID'] = table1['ID'].astype(str) 

    if 'Indicator ID' in table2.columns:
        table2['ID'] = table2['Indicator ID'].astype(str)
    else:
        table2['ID'] = table2['ID'].astype(str)

    # Debug: show which columns drive the table1 embeddings.
    print(columns_embeddings_col1)
    # Adds a 'concatenated_input' column to each table (in place).
    table1 = concatenate_columns(table1, columns= columns_embeddings_col1)
    table2 = concatenate_columns(table2,columns= columns_embeddings_col2)

    embeddings1 = get_embbedings(table1, 'concatenated_input')
    embeddings2 = get_embbedings(table2,'concatenated_input')

    # Calculate cosine similarity between the embeddings
    # (rows = table1 indicators, columns = table2 indicators).
    similarities = cos_sim(embeddings1, embeddings2)

    # Create a DataFrame for the similarities, labeled by indicator IDs.
    result_df = pd.DataFrame(similarities, 
                            columns=table2['ID'],
                            index=table1['ID'])
    if harmonization:
        
        # Mapping frameworks: ID -> comma-separated framework string.
        table1_sel_map = table1.set_index('ID')['Framework'].to_dict()
        table2_sel_map = table2.set_index('ID')['Framework'].to_dict()

        # Function to check if there is any common framework element
        # (frameworks are compared as sets split on ', ').
        def has_common_framework(table1_framework, table2_framework):
            table1_frameworks = set(table1_framework.split(', '))
            table2_frameworks = set(table2_framework.split(', '))
            return not table1_frameworks.isdisjoint(table2_frameworks)

        # Replace similarity values with NaN where the frameworks match,
        # so a pair sharing a framework can never be proposed as a match.
        for table1_id, table1_framework in table1_sel_map.items():
            for table2_id in result_df.columns:
                table2_framework = table2_sel_map.get(table2_id)
                if pd.notna(table2_framework) and pd.notna(table1_framework):
                    if has_common_framework(table1_framework, table2_framework):
                        result_df.loc[table1_id, table2_id] = np.nan



    # Function to return the column names of the top 5 values for each row
    # (nlargest skips NaNs, so harmonization-blanked pairs are excluded).
    def top_5_column(row):
        # Find the top 5 values in the row
        top_5_values = row.nlargest(5)
        # Return the column names corresponding to these values
        return top_5_values.index.tolist()

    # Convert all columns to numeric data types, coercing non-convertible values to NaN
    #result_df = result_df.iloc[:,1:].apply(pd.to_numeric, errors='coerce')

    # Get the list of non-numeric columns
    #non_numeric_columns = result_df.columns[result_df.dtypes == 'object']

    # Apply the function to each row of the DataFrame, excluding non-numeric columns
    result_df['Top 5 Column ID'] = result_df.apply(lambda row: top_5_column(row), axis=1)

    # Create a dictionary for fast lookup: table2 ID -> indicator name.
    id_to_name = dict(zip(table2['ID'], table2['Indicator name (leonardo)']))

    # Function to map IDs to names ("ID" is the fallback for unknown IDs).
    def map_ids_to_names(id_list):
        return [id_to_name.get(id, "ID") for id in id_list]

    # Apply the function to the 'Top 5 Column ID' column
    result_df['Top 5 Names'] = result_df['Top 5 Column ID'].apply(map_ids_to_names)

    # Ensure all entries are lists and have at least 5 elements, filling missing values with None
    result_df['Top 5 Names'] = result_df['Top 5 Names'].apply(lambda x: x if isinstance(x, list) and len(x) >= 5 else (x + [None] * (5 - len(x)) if isinstance(x, list) else [None]*5))

    # Convert list in 'Top 5 Names' to separate columns
    new_cols = pd.DataFrame(result_df['Top 5 Names'].tolist(), index=result_df.index, columns=["top1name", "top2name", "top3name", "top4name", "top5name"])
    result_df = result_df.join(new_cols)

    # Ensure all entries are lists and have exactly 5 elements, filling missing values with None
    result_df['Top 5 Column ID'] = result_df['Top 5 Column ID'].apply(
        lambda x: x if isinstance(x, list) and len(x) >= 5 else (x + [None] * (5 - len(x)) if isinstance(x, list) else [None]*5)
    )

    # Convert list in 'Top 5 Column ID' to separate columns
    new_ids_cols = pd.DataFrame(result_df['Top 5 Column ID'].tolist(), index=result_df.index, columns=["top1id", "top2id", "top3id", "top4id", "top5id"])
    result_df = result_df.join(new_ids_cols)

    # Best similarity per table1 row, over the original similarity columns only.
    # NOTE(review): if harmonization blanks an entire row (every table2
    # indicator shares a framework), np.nanmax hits an all-NaN slice and
    # raises/warns — confirm whether that can occur with real data.
    result_df['max_sim'] = np.nanmax(result_df[table2['ID']], axis=1)

    # Calculate min and max of the 'max_sim' column, ignoring NaN values
    min_val = np.nanmin(result_df['max_sim'])
    max_val = np.nanmax(result_df['max_sim'])

    # Normalize the 'max_sim' values to [0, 1].
    # NOTE(review): if all max_sim values are equal, max_val == min_val and
    # this divides by zero (yields NaN/inf) — verify acceptable.
    result_df['max_sim_normalized'] = (result_df['max_sim'] - min_val) / (max_val - min_val)

    # Keep only the score and top-5 columns; the ID index is restored as a
    # column by the merge below.
    result_final = result_df[['max_sim_normalized','top1name', 'top2name', 'top3name', 'top4name', 'top5name', 'top1id',
        'top2id', 'top3id', 'top4id', 'top5id']]
    

    # Merge the DataFrames: reattach table1's name and framework by ID.
    result_final = table1[['ID', 'Indicator Name', 'Framework']].merge(result_final, on='ID', how='left')

    # Create a mapping from table2 ID to Framework
    id_to_framework = table2.set_index('ID')['Framework'].to_dict()

    # Function to map ID to Framework (NaN when the ID is unknown/None).
    def map_framework(id):
        return id_to_framework.get(id, np.nan)

    # Add framework information for top1id to top5id
    result_final['top1framework'] = result_final['top1id'].apply(map_framework)
    result_final['top2framework'] = result_final['top2id'].apply(map_framework)
    result_final['top3framework'] = result_final['top3id'].apply(map_framework)
    result_final['top4framework'] = result_final['top4id'].apply(map_framework)
    result_final['top5framework'] = result_final['top5id'].apply(map_framework)

    return result_final