import gradio as gr
import pandas as pd
import re
import numpy as np
import importlib
from pandas import json_normalize
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from pandas import json_normalize
### Parameters not expected to be changed in every run
# Columns concatenated to form the embedding input text for table 1.
columns_embeddings_col1 = ['Indicator Name']
# Columns concatenated to form the embedding input text for table 2.
columns_embeddings_col2 = ['Indicator name (leonardo)']
#### Functions
from numpy.linalg import norm
print("Functions loaded")


def cos_sim(a, b):
    """Similarity matrix between the row vectors of `a` and `b`.

    Returns (a @ b.T) scaled by 1 / (norm(a) * norm(b)).

    NOTE(review): for 2-D inputs `norm` is the Frobenius norm of the
    whole matrix, not per-row norms, so values are a constant multiple
    of the true pairwise cosine. Downstream use (top-k ranking and
    min-max normalisation) is invariant to that constant scale, so the
    formula is kept as-is for backward compatibility.
    """
    return (a @ b.T) / (norm(a) * norm(b))
def concatenate_columns(df, columns):
    """Add a 'concatenated_input' column built by joining `columns` with '.'.

    Each requested column is cast to string first. Mutates `df` in place
    and also returns it for convenience.

    Raises:
        ValueError: if any name in `columns` is missing from `df`.
    """
    # Guard clause: fail fast when a requested column is absent.
    missing = [name for name in columns if name not in df.columns]
    if missing:
        raise ValueError("One or more specified columns do not exist in the DataFrame")

    # Row-wise join of the stringified column values, e.g. "a.b.c".
    as_text = df[columns].astype(str)
    df['concatenated_input'] = as_text.apply('.'.join, axis=1)
    return df
# Mask-aware mean pooling over token embeddings.
def mean_pooling(model_output, attention_mask):
    """Average the last-hidden-state vectors over non-padding tokens.

    `model_output[0]` holds the last hidden states; `attention_mask`
    marks real tokens with 1 and padding with 0.
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the embedding dimension.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * mask).sum(dim=1)
    # Clamp avoids division by zero for all-padding rows.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts
# Embed the text in table[colname] with a MiniLM sentence encoder.
def get_embbedings(table, colname):
    """Return L2-normalized sentence embeddings for `table[colname]`.

    Downloads 'sentence-transformers/all-MiniLM-L6-v2' from the
    HuggingFace Hub (cached after first use), tokenizes the column
    values, mean-pools the last hidden states and unit-normalizes the
    result so dot products equal cosine similarities.
    """
    checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint)

    sentences = table[colname].tolist()
    encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Inference only — no gradients needed.
    with torch.no_grad():
        output = model(**encoded)

    pooled = mean_pooling(output, encoded['attention_mask'])
    return F.normalize(pooled, p=2, dim=1)
# Process similarity
def process_similarity_results(table1, table2,columns_embeddings_col1,columns_embeddings_col2, harmonization=True):
    """Match every row of table1 to its 5 most similar rows in table2.

    Embeds the concatenated text columns of both tables, computes a
    similarity matrix, and returns table1's ID / Indicator Name /
    Framework columns augmented with: a min-max-normalised best-match
    score (`max_sim_normalized`), the top-5 match names, IDs, and
    frameworks from table2.

    When `harmonization` is True, pairs whose comma-separated
    'Framework' strings share any element are blanked to NaN so an
    indicator is never matched within its own framework.

    NOTE(review): mutates table1 and table2 in place (adds 'ID' and
    'concatenated_input' columns). Assumes both tables carry a
    'Framework' column, table1 an 'Indicator Name' column, and table2
    an 'Indicator name (leonardo)' column — confirm against callers.
    """
    # Use 'Indicator ID' as the string row key when present, else an
    # existing 'ID' column.
    if 'Indicator ID' in table1.columns:
        table1['ID'] = table1['Indicator ID'].astype(str)
    else:
        table1['ID'] = table1['ID'].astype(str)
    if 'Indicator ID' in table2.columns:
        table2['ID'] = table2['Indicator ID'].astype(str)
    else:
        table2['ID'] = table2['ID'].astype(str)
    print(columns_embeddings_col1)
    # Build the text fed to the embedding model ('concatenated_input').
    table1 = concatenate_columns(table1, columns= columns_embeddings_col1)
    table2 = concatenate_columns(table2,columns= columns_embeddings_col2)
    embeddings1 = get_embbedings(table1, 'concatenated_input')
    embeddings2 = get_embbedings(table2,'concatenated_input')
    # Calculate cosine similarity between the embeddings
    similarities = cos_sim(embeddings1, embeddings2)
    # Create a DataFrame for the similarities:
    # rows indexed by table1 IDs, columns labelled by table2 IDs.
    result_df = pd.DataFrame(similarities,
                             columns=table2['ID'],
                             index=table1['ID'])
    if harmonization:
        # Mapping frameworks
        table1_sel_map = table1.set_index('ID')['Framework'].to_dict()
        table2_sel_map = table2.set_index('ID')['Framework'].to_dict()
        # Function to check if there is any common framework element.
        # Frameworks are comma-separated strings, e.g. "SDG, GRI".
        def has_common_framework(table1_framework, table2_framework):
            table1_frameworks = set(table1_framework.split(', '))
            table2_frameworks = set(table2_framework.split(', '))
            return not table1_frameworks.isdisjoint(table2_frameworks)
        # Replace similarity values with NaN where the frameworks match,
        # so nlargest below never picks a same-framework indicator.
        for table1_id, table1_framework in table1_sel_map.items():
            for table2_id in result_df.columns:
                table2_framework = table2_sel_map.get(table2_id)
                if pd.notna(table2_framework) and pd.notna(table1_framework):
                    if has_common_framework(table1_framework, table2_framework):
                        result_df.loc[table1_id, table2_id] = np.nan
    # Function to return the column names of the top 5 values for each row
    def top_5_column(row):
        # Find the top 5 values in the row (NaNs are ignored by nlargest)
        top_5_values = row.nlargest(5)
        # Return the column names corresponding to these values
        return top_5_values.index.tolist()
    # Convert all columns to numeric data types, coercing non-convertible values to NaN
    #result_df = result_df.iloc[:,1:].apply(pd.to_numeric, errors='coerce')
    # Get the list of non-numeric columns
    #non_numeric_columns = result_df.columns[result_df.dtypes == 'object']
    # Apply the function to each row of the DataFrame, excluding non-numeric columns
    result_df['Top 5 Column ID'] = result_df.apply(lambda row: top_5_column(row), axis=1)
    # Create a dictionary for fast lookup (table2 ID -> display name)
    id_to_name = dict(zip(table2['ID'], table2['Indicator name (leonardo)']))
    # Function to map IDs to names.
    # NOTE(review): unknown IDs fall back to the literal string "ID" —
    # looks like a placeholder default; verify intent.
    def map_ids_to_names(id_list):
        return [id_to_name.get(id, "ID") for id in id_list]
    # Apply the function to the 'Top 5 Column ID' column
    result_df['Top 5 Names'] = result_df['Top 5 Column ID'].apply(map_ids_to_names)
    # Ensure all entries are lists and have at least 5 elements, filling missing values with None
    result_df['Top 5 Names'] = result_df['Top 5 Names'].apply(lambda x: x if isinstance(x, list) and len(x) >= 5 else (x + [None] * (5 - len(x)) if isinstance(x, list) else [None]*5))
    # Convert list in 'Top 5 Names' to separate columns
    new_cols = pd.DataFrame(result_df['Top 5 Names'].tolist(), index=result_df.index, columns=["top1name", "top2name", "top3name", "top4name", "top5name"])
    result_df = result_df.join(new_cols)
    # Ensure all entries are lists and have exactly 5 elements, filling missing values with None
    result_df['Top 5 Column ID'] = result_df['Top 5 Column ID'].apply(
        lambda x: x if isinstance(x, list) and len(x) >= 5 else (x + [None] * (5 - len(x)) if isinstance(x, list) else [None]*5)
    )
    # Convert list in 'Top 5 Column ID' to separate columns
    new_ids_cols = pd.DataFrame(result_df['Top 5 Column ID'].tolist(), index=result_df.index, columns=["top1id", "top2id", "top3id", "top4id", "top5id"])
    result_df = result_df.join(new_ids_cols)
    # Best similarity per table1 row, skipping NaNs from harmonization.
    result_df['max_sim'] = np.nanmax(result_df[table2['ID']], axis=1)
    # Calculate min and max of the 'max_sim' column, ignoring NaN values
    min_val = np.nanmin(result_df['max_sim'])
    max_val = np.nanmax(result_df['max_sim'])
    # Normalize the 'max_sim' values (min-max scale to [0, 1]).
    # NOTE(review): divides by zero when all max_sim values are equal.
    result_df['max_sim_normalized'] = (result_df['max_sim'] - min_val) / (max_val - min_val)
    result_final = result_df[['max_sim_normalized','top1name', 'top2name', 'top3name', 'top4name', 'top5name', 'top1id',
           'top2id', 'top3id', 'top4id', 'top5id']]
    # Merge the DataFrames (left join keeps every table1 row)
    result_final = table1[['ID', 'Indicator Name', 'Framework']].merge(result_final, on='ID', how='left')
    # Create a mapping from ID to Framework
    id_to_framework = table2.set_index('ID')['Framework'].to_dict()
    # Function to map ID to Framework (NaN for unknown/None IDs)
    def map_framework(id):
        return id_to_framework.get(id, np.nan)
    # Add framework information for top1id to top5id
    result_final['top1framework'] = result_final['top1id'].apply(map_framework)
    result_final['top2framework'] = result_final['top2id'].apply(map_framework)
    result_final['top3framework'] = result_final['top3id'].apply(map_framework)
    result_final['top4framework'] = result_final['top4id'].apply(map_framework)
    result_final['top5framework'] = result_final['top5id'].apply(map_framework)
    return result_final
|