Spaces:
Sleeping
Sleeping
create indicator recommend app
Browse files- app.py +38 -0
- functions.py +114 -5
- indicator_harmonizer.ipynb +12 -198
- requirements +9 -0
app.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import tempfile

import gradio as gr
import pandas as pd


def process_tables(table1_path, table2_path, columns_embeddings_col1, columns_embeddings_col2, harmonization):
    """Load two uploaded CSV tables, run the similarity pipeline, and return
    the path of the exported result CSV.

    Parameters
    ----------
    table1_path, table2_path : gradio file wrapper or str
        ``gr.File`` passes a tempfile-like object whose ``.name`` holds the
        real path; plain path strings are also accepted.
    columns_embeddings_col1, columns_embeddings_col2 : str
        Comma-separated column names intended for embedding construction.
        NOTE(review): currently collected from the UI but not forwarded to
        ``functions`` — wire these through once the backend accepts them.
    harmonization : bool
        Passed straight through to ``functions.process_similarity_results``.

    Returns
    -------
    str
        Path of the written ``result_final.csv``.
    """
    # Imported lazily: functions.py pulls in transformers/torch, which are
    # slow to load and not needed just to start the UI.
    import functions

    # Unwrap gr.File objects; getattr keeps plain strings working too.
    path1 = getattr(table1_path, "name", table1_path)
    path2 = getattr(table2_path, "name", table2_path)

    table1 = pd.read_csv(path1)
    table2 = pd.read_csv(path2)

    result_final = functions.process_similarity_results(table1, table2, harmonization)

    # Write to the system temp dir instead of the hard-coded /mnt/data,
    # which does not exist on most hosts (e.g. Hugging Face Spaces).
    result_path = os.path.join(tempfile.gettempdir(), "result_final.csv")
    result_final.to_csv(result_path, index=False)
    return result_path


# Define Gradio interface
iface = gr.Interface(
    fn=process_tables,
    inputs=[
        gr.File(label="Upload Table 1 (Client Indicators or Framework Table)"),
        gr.File(label="Upload Table 2 (Internal Indicator or Indicator Table)"),
        # ``default=`` was removed in Gradio 3.x; ``value=`` is the supported
        # keyword for an initial value (the Checkbox below already used it).
        gr.Textbox(label="Columns for Embeddings in Table 1", value="Indicator Name", placeholder="Enter column names separated by commas"),
        gr.Textbox(label="Columns for Embeddings in Table 2", value="Indicator name (leonardo)", placeholder="Enter column names separated by commas"),
        gr.Checkbox(label="Harmonization Mode", value=True),
    ],
    outputs=[
        gr.File(label="Download Processed Results")
    ],
    description="Upload two tables and process them based on the selected parameters. If harmonization, set Table 1 as the Framework Table and Table 2 as the Indicator Table. Otherwise, set Table 1 as the Client Indicators Table and Table 2 as the Internal Indicator Table."
)

iface.launch()
|
functions.py
CHANGED
|
@@ -9,6 +9,9 @@ from pandas import json_normalize
|
|
| 9 |
from transformers import AutoTokenizer, AutoModel
|
| 10 |
import torch
|
| 11 |
import torch.nn.functional as F
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
### Parameters not expected to be changed in every run
|
| 14 |
|
|
@@ -20,11 +23,6 @@ columns_embeddings_col1 = ['Indicator Name']
|
|
| 20 |
columns_embeddings_col2 = ['Indicator name (leonardo)']
|
| 21 |
|
| 22 |
|
| 23 |
-
# ID column
|
| 24 |
-
|
| 25 |
-
table1_id_col = ['ID']
|
| 26 |
-
table2_id_col = ['ID']
|
| 27 |
-
|
| 28 |
#### Functions
|
| 29 |
|
| 30 |
from numpy.linalg import norm
|
|
@@ -71,3 +69,114 @@ def get_embbedings(table, colname):
|
|
| 71 |
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
|
| 72 |
|
| 73 |
return sentence_embeddings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
from transformers import AutoTokenizer, AutoModel
|
| 10 |
import torch
|
| 11 |
import torch.nn.functional as F
|
| 12 |
+
from seatable_api import Base, context
|
| 13 |
+
from pandas import json_normalize
|
| 14 |
+
import importlib
|
| 15 |
|
| 16 |
### Parameters not expected to be changed in every run
|
| 17 |
|
|
|
|
| 23 |
columns_embeddings_col2 = ['Indicator name (leonardo)']
|
| 24 |
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
#### Functions
|
| 27 |
|
| 28 |
from numpy.linalg import norm
|
|
|
|
| 69 |
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
|
| 70 |
|
| 71 |
return sentence_embeddings
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# Process similarity
|
| 75 |
+
|
| 76 |
+
# Process similarity

def process_similarity_results(table1, table2, harmonization=True):
    """Rank, for each row of ``table1``, the five most similar rows of
    ``table2`` by embedding cosine similarity.

    Parameters
    ----------
    table1 : pandas.DataFrame
        Must contain 'ID', 'Indicator Name', 'Framework' and a
        'concatenated_input' text column (built upstream by
        ``concatenate_columns``).
    table2 : pandas.DataFrame
        Must contain 'ID', 'Indicator name (leonardo)', 'Framework' and
        'concatenated_input'.
    harmonization : bool, default True
        When True, similarities between rows sharing at least one framework
        are masked to NaN, so an indicator is never recommended from a
        framework it already belongs to.

    Returns
    -------
    pandas.DataFrame
        One row per ``table1`` row with columns: ID, Indicator Name,
        Framework, max_sim_normalized, top1..top5 name / id / framework.
    """
    embeddings1 = get_embbedings(table1, 'concatenated_input')
    embeddings2 = get_embbedings(table2, 'concatenated_input')

    # Cosine similarity matrix: rows follow table1, columns follow table2.
    similarities = cos_sim(embeddings1, embeddings2)

    result_df = pd.DataFrame(similarities,
                             columns=table2['ID'],
                             index=table1['ID'])

    if harmonization:
        # Map each ID to its (comma-separated) framework string.
        table1_sel_map = table1.set_index('ID')['Framework'].to_dict()
        table2_sel_map = table2.set_index('ID')['Framework'].to_dict()

        def has_common_framework(table1_framework, table2_framework):
            # Framework cells hold ', '-separated lists; any overlap counts.
            table1_frameworks = set(table1_framework.split(', '))
            table2_frameworks = set(table2_framework.split(', '))
            return not table1_frameworks.isdisjoint(table2_frameworks)

        # Mask (NaN) similarities between rows that share a framework.
        for table1_id, table1_framework in table1_sel_map.items():
            for table2_id in result_df.columns:
                table2_framework = table2_sel_map.get(table2_id)
                if pd.notna(table2_framework) and pd.notna(table1_framework):
                    if has_common_framework(table1_framework, table2_framework):
                        result_df.loc[table1_id, table2_id] = np.nan

    def top_5_column(row):
        # Column labels (table2 IDs) of the 5 largest values in the row;
        # nlargest skips NaN, so masked pairs are never recommended.
        return row.nlargest(5).index.tolist()

    def _pad5(x):
        # Guarantee a list of exactly 5 entries, padding with None.
        if not isinstance(x, list):
            return [None] * 5
        return x + [None] * (5 - len(x)) if len(x) < 5 else x

    result_df['Top 5 Column ID'] = result_df.apply(top_5_column, axis=1)

    # Fast ID -> display-name lookup.
    id_to_name = dict(zip(table2['ID'], table2['Indicator name (leonardo)']))

    def map_ids_to_names(id_list):
        # BUG FIX: the original returned the literal string "ID" for an
        # unknown id; None is the correct sentinel for a missing name.
        return [id_to_name.get(i) for i in id_list]

    result_df['Top 5 Names'] = result_df['Top 5 Column ID'].apply(map_ids_to_names)
    result_df['Top 5 Names'] = result_df['Top 5 Names'].apply(_pad5)

    # Expand the name list into top1name..top5name columns.
    new_cols = pd.DataFrame(result_df['Top 5 Names'].tolist(), index=result_df.index,
                            columns=["top1name", "top2name", "top3name", "top4name", "top5name"])
    result_df = result_df.join(new_cols)

    # Same expansion for the IDs: top1id..top5id.
    result_df['Top 5 Column ID'] = result_df['Top 5 Column ID'].apply(_pad5)
    new_ids_cols = pd.DataFrame(result_df['Top 5 Column ID'].tolist(), index=result_df.index,
                                columns=["top1id", "top2id", "top3id", "top4id", "top5id"])
    result_df = result_df.join(new_ids_cols)

    # Best similarity per table1 row, ignoring masked NaNs.
    result_df['max_sim'] = np.nanmax(result_df[table2['ID']], axis=1)

    # Min-max normalise 'max_sim'; guard the zero-range case (all rows
    # equally similar), which divided by zero in the original code.
    min_val = np.nanmin(result_df['max_sim'])
    max_val = np.nanmax(result_df['max_sim'])
    span = max_val - min_val
    if span == 0 or np.isnan(span):
        result_df['max_sim_normalized'] = np.nan
    else:
        result_df['max_sim_normalized'] = (result_df['max_sim'] - min_val) / span

    result_final = result_df[['max_sim_normalized', 'top1name', 'top2name', 'top3name',
                              'top4name', 'top5name', 'top1id', 'top2id', 'top3id',
                              'top4id', 'top5id']]

    # result_final carries table1['ID'] only as its index; surface it as a
    # real column so the merge key exists on both sides regardless of the
    # pandas version's index-level merge support.
    result_final = result_final.reset_index()
    result_final = table1[['ID', 'Indicator Name', 'Framework']].merge(result_final, on='ID', how='left')

    # Attach the framework of each recommended indicator; Series.map with a
    # dict yields NaN for missing keys, matching the original default.
    id_to_framework = table2.set_index('ID')['Framework'].to_dict()
    for rank in range(1, 6):
        result_final[f'top{rank}framework'] = result_final[f'top{rank}id'].map(id_to_framework)

    return result_final
|
indicator_harmonizer.ipynb
CHANGED
|
@@ -21,18 +21,6 @@
|
|
| 21 |
"## 1 Load required packages"
|
| 22 |
]
|
| 23 |
},
|
| 24 |
-
{
|
| 25 |
-
"cell_type": "code",
|
| 26 |
-
"execution_count": 122,
|
| 27 |
-
"metadata": {},
|
| 28 |
-
"outputs": [],
|
| 29 |
-
"source": [
|
| 30 |
-
"#! pip install transformers\n",
|
| 31 |
-
"#! pip install torch\n",
|
| 32 |
-
"#! pip install scipy\n",
|
| 33 |
-
"#! pip install seaborn"
|
| 34 |
-
]
|
| 35 |
-
},
|
| 36 |
{
|
| 37 |
"cell_type": "code",
|
| 38 |
"execution_count": 1,
|
|
@@ -82,7 +70,16 @@
|
|
| 82 |
"metadata": {},
|
| 83 |
"outputs": [],
|
| 84 |
"source": [
|
| 85 |
-
"table1 = pd.read_excel('/Users/alanfortunysicart/Downloads/Indicators_Indicators_Frameworks_Default View(1).xlsx')"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
]
|
| 87 |
},
|
| 88 |
{
|
|
@@ -126,7 +123,6 @@
|
|
| 126 |
"metadata": {},
|
| 127 |
"outputs": [],
|
| 128 |
"source": [
|
| 129 |
-
"table2 = pd.read_excel('/Users/alanfortunysicart/Downloads/Indicators_Indicators_Default view(14).xlsx')\n",
|
| 130 |
"table2 = f.concatenate_columns(table2,columns=f.columns_embeddings_col2)\n",
|
| 131 |
"\n",
|
| 132 |
"if 'Indicator ID' in table2.columns:\n",
|
|
@@ -143,195 +139,13 @@
|
|
| 143 |
"# 3. Compute the similarity between leonardo. indicator and the requested names"
|
| 144 |
]
|
| 145 |
},
|
| 146 |
-
{
|
| 147 |
-
"cell_type": "markdown",
|
| 148 |
-
"metadata": {},
|
| 149 |
-
"source": [
|
| 150 |
-
"### embeddings 1"
|
| 151 |
-
]
|
| 152 |
-
},
|
| 153 |
-
{
|
| 154 |
-
"cell_type": "code",
|
| 155 |
-
"execution_count": 6,
|
| 156 |
-
"metadata": {},
|
| 157 |
-
"outputs": [],
|
| 158 |
-
"source": [
|
| 159 |
-
"embeddings1 = f.get_embbedings(table1, 'concatenated_input')"
|
| 160 |
-
]
|
| 161 |
-
},
|
| 162 |
-
{
|
| 163 |
-
"cell_type": "markdown",
|
| 164 |
-
"metadata": {},
|
| 165 |
-
"source": [
|
| 166 |
-
"### embeddings 2"
|
| 167 |
-
]
|
| 168 |
-
},
|
| 169 |
-
{
|
| 170 |
-
"cell_type": "code",
|
| 171 |
-
"execution_count": 7,
|
| 172 |
-
"metadata": {},
|
| 173 |
-
"outputs": [],
|
| 174 |
-
"source": [
|
| 175 |
-
"embeddings2 = f.get_embbedings(table2,'concatenated_input')"
|
| 176 |
-
]
|
| 177 |
-
},
|
| 178 |
{
|
| 179 |
"cell_type": "code",
|
| 180 |
"execution_count": 8,
|
| 181 |
"metadata": {},
|
| 182 |
"outputs": [],
|
| 183 |
"source": [
|
| 184 |
-
"
|
| 185 |
-
"# Calculate cosine similarity between the embeddings\n",
|
| 186 |
-
"similarities = f.cos_sim(embeddings1, embeddings2)\n"
|
| 187 |
-
]
|
| 188 |
-
},
|
| 189 |
-
{
|
| 190 |
-
"cell_type": "code",
|
| 191 |
-
"execution_count": 30,
|
| 192 |
-
"metadata": {},
|
| 193 |
-
"outputs": [],
|
| 194 |
-
"source": [
|
| 195 |
-
"# Create a DataFrame for the similarities\n",
|
| 196 |
-
"result_df = pd.DataFrame(similarities, \n",
|
| 197 |
-
" columns=table2['ID'],\n",
|
| 198 |
-
" index=table1['ID'])\n",
|
| 199 |
-
"\n",
|
| 200 |
-
"# Mapping frameworks\n",
|
| 201 |
-
"table1_sel_map = table1.set_index('ID')['Framework'].to_dict()\n",
|
| 202 |
-
"table2_sel_map = table2.set_index('ID')['Framework'].to_dict()\n",
|
| 203 |
-
"\n",
|
| 204 |
-
"# Function to check if there is any common framework element\n",
|
| 205 |
-
"def has_common_framework(table1_framework, table2_framework):\n",
|
| 206 |
-
" table1_frameworks = set(table1_framework.split(', '))\n",
|
| 207 |
-
" table2_frameworks = set(table2_framework.split(', '))\n",
|
| 208 |
-
" return not table1_frameworks.isdisjoint(table2_frameworks)\n",
|
| 209 |
-
"\n",
|
| 210 |
-
"# Replace similarity values with NaN where the frameworks match\n",
|
| 211 |
-
"for table1_id, table1_framework in table1_sel_map.items():\n",
|
| 212 |
-
" for table2_id in result_df.columns:\n",
|
| 213 |
-
" table2_framework = table2_sel_map.get(table2_id)\n",
|
| 214 |
-
" if pd.notna(table2_framework) and pd.notna(table1_framework):\n",
|
| 215 |
-
" if has_common_framework(table1_framework, table2_framework):\n",
|
| 216 |
-
" result_df.loc[table1_id, table2_id] = np.nan\n",
|
| 217 |
-
"\n",
|
| 218 |
-
"\n",
|
| 219 |
-
"\n",
|
| 220 |
-
"# Function to return the column names of the top 5 values for each row\n",
|
| 221 |
-
"def top_5_column(row):\n",
|
| 222 |
-
" # Find the top 5 values in the row\n",
|
| 223 |
-
" top_5_values = row.nlargest(5)\n",
|
| 224 |
-
" # Return the column names corresponding to these values\n",
|
| 225 |
-
" return top_5_values.index.tolist()\n",
|
| 226 |
-
"\n",
|
| 227 |
-
"# Convert all columns to numeric data types, coercing non-convertible values to NaN\n",
|
| 228 |
-
"#result_df = result_df.iloc[:,1:].apply(pd.to_numeric, errors='coerce')\n",
|
| 229 |
-
"\n",
|
| 230 |
-
"# Get the list of non-numeric columns\n",
|
| 231 |
-
"#non_numeric_columns = result_df.columns[result_df.dtypes == 'object']\n",
|
| 232 |
-
"\n",
|
| 233 |
-
"# Apply the function to each row of the DataFrame, excluding non-numeric columns\n",
|
| 234 |
-
"result_df['Top 5 Column ID'] = result_df.apply(lambda row: top_5_column(row), axis=1)\n",
|
| 235 |
-
"\n",
|
| 236 |
-
"# Create a dictionary for fast lookup\n",
|
| 237 |
-
"id_to_name = dict(zip(table2['ID'], table2['Indicator name (leonardo)']))\n",
|
| 238 |
-
"\n",
|
| 239 |
-
"# Function to map IDs to names\n",
|
| 240 |
-
"def map_ids_to_names(id_list):\n",
|
| 241 |
-
" return [id_to_name.get(id, \"ID\") for id in id_list]\n",
|
| 242 |
-
"\n",
|
| 243 |
-
"# Apply the function to the 'Top 5 Column ID' column\n",
|
| 244 |
-
"result_df['Top 5 Names'] = result_df['Top 5 Column ID'].apply(map_ids_to_names)\n",
|
| 245 |
-
"\n",
|
| 246 |
-
"# Ensure all entries are lists and have at least 5 elements, filling missing values with None\n",
|
| 247 |
-
"result_df['Top 5 Names'] = result_df['Top 5 Names'].apply(lambda x: x if isinstance(x, list) and len(x) >= 5 else (x + [None] * (5 - len(x)) if isinstance(x, list) else [None]*5))\n",
|
| 248 |
-
"\n",
|
| 249 |
-
"# Convert list in 'Top 5 Names' to separate columns\n",
|
| 250 |
-
"new_cols = pd.DataFrame(result_df['Top 5 Names'].tolist(), index=result_df.index, columns=[\"top1name\", \"top2name\", \"top3name\", \"top4name\", \"top5name\"])\n",
|
| 251 |
-
"result_df = result_df.join(new_cols)\n",
|
| 252 |
-
"\n",
|
| 253 |
-
"# Ensure all entries are lists and have exactly 5 elements, filling missing values with None\n",
|
| 254 |
-
"result_df['Top 5 Column ID'] = result_df['Top 5 Column ID'].apply(\n",
|
| 255 |
-
" lambda x: x if isinstance(x, list) and len(x) >= 5 else (x + [None] * (5 - len(x)) if isinstance(x, list) else [None]*5)\n",
|
| 256 |
-
")\n",
|
| 257 |
-
"\n",
|
| 258 |
-
"# Convert list in 'Top 5 Column ID' to separate columns\n",
|
| 259 |
-
"new_ids_cols = pd.DataFrame(result_df['Top 5 Column ID'].tolist(), index=result_df.index, columns=[\"top1id\", \"top2id\", \"top3id\", \"top4id\", \"top5id\"])\n",
|
| 260 |
-
"result_df = result_df.join(new_ids_cols)\n",
|
| 261 |
-
"\n"
|
| 262 |
-
]
|
| 263 |
-
},
|
| 264 |
-
{
|
| 265 |
-
"cell_type": "code",
|
| 266 |
-
"execution_count": 31,
|
| 267 |
-
"metadata": {},
|
| 268 |
-
"outputs": [],
|
| 269 |
-
"source": [
|
| 270 |
-
"result_df['max_sim'] = np.nanmax(result_df[table2['ID']], axis=1)"
|
| 271 |
-
]
|
| 272 |
-
},
|
| 273 |
-
{
|
| 274 |
-
"cell_type": "code",
|
| 275 |
-
"execution_count": 32,
|
| 276 |
-
"metadata": {},
|
| 277 |
-
"outputs": [],
|
| 278 |
-
"source": [
|
| 279 |
-
"\n",
|
| 280 |
-
"# Calculate min and max of the 'max_sim' column, ignoring NaN values\n",
|
| 281 |
-
"min_val = np.nanmin(result_df['max_sim'])\n",
|
| 282 |
-
"max_val = np.nanmax(result_df['max_sim'])\n",
|
| 283 |
-
"\n",
|
| 284 |
-
"# Normalize the 'max_sim' values\n",
|
| 285 |
-
"result_df['max_sim_normalized'] = (result_df['max_sim'] - min_val) / (max_val - min_val)"
|
| 286 |
-
]
|
| 287 |
-
},
|
| 288 |
-
{
|
| 289 |
-
"cell_type": "markdown",
|
| 290 |
-
"metadata": {},
|
| 291 |
-
"source": [
|
| 292 |
-
"# 4 Assess the quality of the similarity, normalizing the similarity score"
|
| 293 |
-
]
|
| 294 |
-
},
|
| 295 |
-
{
|
| 296 |
-
"cell_type": "markdown",
|
| 297 |
-
"metadata": {},
|
| 298 |
-
"source": [
|
| 299 |
-
"Calculate what the maximum similarity is, to identify how reliable the recommendation is and whether new indicators may be required"
|
| 300 |
-
]
|
| 301 |
-
},
|
| 302 |
-
{
|
| 303 |
-
"cell_type": "code",
|
| 304 |
-
"execution_count": 42,
|
| 305 |
-
"metadata": {},
|
| 306 |
-
"outputs": [],
|
| 307 |
-
"source": [
|
| 308 |
-
"result_final = result_df[['max_sim_normalized','top1name', 'top2name', 'top3name', 'top4name', 'top5name', 'top1id',\n",
|
| 309 |
-
" 'top2id', 'top3id', 'top4id', 'top5id']]\n",
|
| 310 |
-
"\n",
|
| 311 |
-
"# Merge the required columns\n",
|
| 312 |
-
"result_final = table1[['Indicator ID', 'Indicator Name', 'Framework']].merge(result_final, \n",
|
| 313 |
-
" left_index=True, right_index=True, how='left')\n"
|
| 314 |
-
]
|
| 315 |
-
},
|
| 316 |
-
{
|
| 317 |
-
"cell_type": "code",
|
| 318 |
-
"execution_count": 43,
|
| 319 |
-
"metadata": {},
|
| 320 |
-
"outputs": [],
|
| 321 |
-
"source": [
|
| 322 |
-
"# Create a mapping from ID to Framework\n",
|
| 323 |
-
"id_to_framework = table2.set_index('ID')['Framework'].to_dict()\n",
|
| 324 |
-
"\n",
|
| 325 |
-
"# Function to map ID to Framework\n",
|
| 326 |
-
"def map_framework(id):\n",
|
| 327 |
-
" return id_to_framework.get(id, np.nan)\n",
|
| 328 |
-
"\n",
|
| 329 |
-
"# Add framework information for top1id to top5id\n",
|
| 330 |
-
"result_final['top1framework'] = result_final['top1id'].apply(map_framework)\n",
|
| 331 |
-
"result_final['top2framework'] = result_final['top2id'].apply(map_framework)\n",
|
| 332 |
-
"result_final['top3framework'] = result_final['top3id'].apply(map_framework)\n",
|
| 333 |
-
"result_final['top4framework'] = result_final['top4id'].apply(map_framework)\n",
|
| 334 |
-
"result_final['top5framework'] = result_final['top5id'].apply(map_framework)"
|
| 335 |
]
|
| 336 |
},
|
| 337 |
{
|
|
@@ -343,7 +157,7 @@
|
|
| 343 |
},
|
| 344 |
{
|
| 345 |
"cell_type": "code",
|
| 346 |
-
"execution_count":
|
| 347 |
"metadata": {},
|
| 348 |
"outputs": [],
|
| 349 |
"source": [
|
|
|
|
| 21 |
"## 1 Load required packages"
|
| 22 |
]
|
| 23 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
{
|
| 25 |
"cell_type": "code",
|
| 26 |
"execution_count": 1,
|
|
|
|
| 70 |
"metadata": {},
|
| 71 |
"outputs": [],
|
| 72 |
"source": [
|
| 73 |
+
"table1 = pd.read_excel('/Users/alanfortunysicart/Downloads/Indicators_Indicators_Frameworks_Default View(1).xlsx')\n",
|
| 74 |
+
"table2 = pd.read_excel('/Users/alanfortunysicart/Downloads/Indicators_Indicators_Default view(14).xlsx')\n",
|
| 75 |
+
"# columns to use for embeddings on table 1\n",
|
| 76 |
+
"\n",
|
| 77 |
+
"columns_embeddings_col1 = ['Indicator Name']\n",
|
| 78 |
+
"\n",
|
| 79 |
+
"# columns to use for embeddings on table 2\n",
|
| 80 |
+
"columns_embeddings_col2 = ['Indicator name (leonardo)']\n",
|
| 81 |
+
"\n",
|
| 82 |
+
"harmonization=True"
|
| 83 |
]
|
| 84 |
},
|
| 85 |
{
|
|
|
|
| 123 |
"metadata": {},
|
| 124 |
"outputs": [],
|
| 125 |
"source": [
|
|
|
|
| 126 |
"table2 = f.concatenate_columns(table2,columns=f.columns_embeddings_col2)\n",
|
| 127 |
"\n",
|
| 128 |
"if 'Indicator ID' in table2.columns:\n",
|
|
|
|
| 139 |
"# 3. Compute the similarity between leonardo. indicator and the requested names"
|
| 140 |
]
|
| 141 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
{
|
| 143 |
"cell_type": "code",
|
| 144 |
"execution_count": 8,
|
| 145 |
"metadata": {},
|
| 146 |
"outputs": [],
|
| 147 |
"source": [
|
| 148 |
+
"result_final = f.process_similarity_results(table1, table2, harmonization=True)"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
]
|
| 150 |
},
|
| 151 |
{
|
|
|
|
| 157 |
},
|
| 158 |
{
|
| 159 |
"cell_type": "code",
|
| 160 |
+
"execution_count": 9,
|
| 161 |
"metadata": {},
|
| 162 |
"outputs": [],
|
| 163 |
"source": [
|
requirements
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
gradio
pandas
numpy
seatable_api
transformers
torch
|