afortuny committed on
Commit
ccada5d
·
1 Parent(s): 266bd2c

create indicator recommend app

Browse files
Files changed (4) hide show
  1. app.py +38 -0
  2. functions.py +114 -5
  3. indicator_harmonizer.ipynb +12 -198
  4. requirements +9 -0
app.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+
4
+ def process_tables(table1_path, table2_path, columns_embeddings_col1, columns_embeddings_col2, harmonization):
5
+ # Assuming process_similarity_results is a function defined in the uploaded functions.py
6
+ # that we've adapted based on previous discussions.
7
+ import functions
8
+
9
+ # Load tables
10
+ table1 = pd.read_csv(table1_path)
11
+ table2 = pd.read_csv(table2_path)
12
+
13
+ # You would include here the logic from your Jupyter notebook
14
+ # This should involve any preprocessing, calling the similarity function, etc.
15
+ result_final = functions.process_similarity_results(table1, table2, harmonization)
16
+
17
+ # Export to CSV
18
+ result_path = "/mnt/data/result_final.csv"
19
+ result_final.to_csv(result_path, index=False)
20
+ return result_path
21
+
22
+ # Define Gradio interface
23
+ iface = gr.Interface(
24
+ fn=process_tables,
25
+ inputs=[
26
+ gr.File(label="Upload Table 1 (Client Indicators or Framework Table)"),
27
+ gr.File(label="Upload Table 2 (Internal Indicator or Indicator Table)"),
28
+ gr.Textbox(label="Columns for Embeddings in Table 1", default="Indicator Name", placeholder="Enter column names separated by commas"),
29
+ gr.Textbox(label="Columns for Embeddings in Table 2", default="Indicator name (leonardo)", placeholder="Enter column names separated by commas"),
30
+ gr.Checkbox(label="Harmonization Mode", value=True)
31
+ ],
32
+ outputs=[
33
+ gr.File(label="Download Processed Results")
34
+ ],
35
+ description="Upload two tables and process them based on the selected parameters. If harmonization, set Table 1 as the Framework Table and Table 2 as the Indicator Table. Otherwise, set Table 1 as the Client Indicators Table and Table 2 as the Internal Indicator Table."
36
+ )
37
+
38
+ iface.launch()
functions.py CHANGED
@@ -9,6 +9,9 @@ from pandas import json_normalize
9
  from transformers import AutoTokenizer, AutoModel
10
  import torch
11
  import torch.nn.functional as F
 
 
 
12
 
13
  ### Parameters not expected to be changed in every run
14
 
@@ -20,11 +23,6 @@ columns_embeddings_col1 = ['Indicator Name']
20
  columns_embeddings_col2 = ['Indicator name (leonardo)']
21
 
22
 
23
- # ID column
24
-
25
- table1_id_col = ['ID']
26
- table2_id_col = ['ID']
27
-
28
  #### Functions
29
 
30
  from numpy.linalg import norm
@@ -71,3 +69,114 @@ def get_embbedings(table, colname):
71
  sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
72
 
73
  return sentence_embeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  from transformers import AutoTokenizer, AutoModel
10
  import torch
11
  import torch.nn.functional as F
12
+ from seatable_api import Base, context
13
+ from pandas import json_normalize
14
+ import importlib
15
 
16
  ### Parameters not expected to be changed in every run
17
 
 
23
  columns_embeddings_col2 = ['Indicator name (leonardo)']
24
 
25
 
 
 
 
 
 
26
  #### Functions
27
 
28
  from numpy.linalg import norm
 
69
  sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
70
 
71
  return sentence_embeddings
72
+
73
+
74
+ # Process similarity
75
+
76
+ def process_similarity_results(table1, table2, harmonization=True):
77
+
78
+ embeddings1 = get_embbedings(table1, 'concatenated_input')
79
+ embeddings2 = get_embbedings(table2,'concatenated_input')
80
+
81
+ # Calculate cosine similarity between the embeddings
82
+ similarities = cos_sim(embeddings1, embeddings2)
83
+
84
+ # Create a DataFrame for the similarities
85
+ result_df = pd.DataFrame(similarities,
86
+ columns=table2['ID'],
87
+ index=table1['ID'])
88
+ if harmonization:
89
+
90
+ # Mapping frameworks
91
+ table1_sel_map = table1.set_index('ID')['Framework'].to_dict()
92
+ table2_sel_map = table2.set_index('ID')['Framework'].to_dict()
93
+
94
+ # Function to check if there is any common framework element
95
+ def has_common_framework(table1_framework, table2_framework):
96
+ table1_frameworks = set(table1_framework.split(', '))
97
+ table2_frameworks = set(table2_framework.split(', '))
98
+ return not table1_frameworks.isdisjoint(table2_frameworks)
99
+
100
+ # Replace similarity values with NaN where the frameworks match
101
+ for table1_id, table1_framework in table1_sel_map.items():
102
+ for table2_id in result_df.columns:
103
+ table2_framework = table2_sel_map.get(table2_id)
104
+ if pd.notna(table2_framework) and pd.notna(table1_framework):
105
+ if has_common_framework(table1_framework, table2_framework):
106
+ result_df.loc[table1_id, table2_id] = np.nan
107
+
108
+
109
+
110
+ # Function to return the column names of the top 5 values for each row
111
+ def top_5_column(row):
112
+ # Find the top 5 values in the row
113
+ top_5_values = row.nlargest(5)
114
+ # Return the column names corresponding to these values
115
+ return top_5_values.index.tolist()
116
+
117
+ # Convert all columns to numeric data types, coercing non-convertible values to NaN
118
+ #result_df = result_df.iloc[:,1:].apply(pd.to_numeric, errors='coerce')
119
+
120
+ # Get the list of non-numeric columns
121
+ #non_numeric_columns = result_df.columns[result_df.dtypes == 'object']
122
+
123
+ # Apply the function to each row of the DataFrame, excluding non-numeric columns
124
+ result_df['Top 5 Column ID'] = result_df.apply(lambda row: top_5_column(row), axis=1)
125
+
126
+ # Create a dictionary for fast lookup
127
+ id_to_name = dict(zip(table2['ID'], table2['Indicator name (leonardo)']))
128
+
129
+ # Function to map IDs to names
130
+ def map_ids_to_names(id_list):
131
+ return [id_to_name.get(id, "ID") for id in id_list]
132
+
133
+ # Apply the function to the 'Top 5 Column ID' column
134
+ result_df['Top 5 Names'] = result_df['Top 5 Column ID'].apply(map_ids_to_names)
135
+
136
+ # Ensure all entries are lists and have at least 5 elements, filling missing values with None
137
+ result_df['Top 5 Names'] = result_df['Top 5 Names'].apply(lambda x: x if isinstance(x, list) and len(x) >= 5 else (x + [None] * (5 - len(x)) if isinstance(x, list) else [None]*5))
138
+
139
+ # Convert list in 'Top 5 Names' to separate columns
140
+ new_cols = pd.DataFrame(result_df['Top 5 Names'].tolist(), index=result_df.index, columns=["top1name", "top2name", "top3name", "top4name", "top5name"])
141
+ result_df = result_df.join(new_cols)
142
+
143
+ # Ensure all entries are lists and have exactly 5 elements, filling missing values with None
144
+ result_df['Top 5 Column ID'] = result_df['Top 5 Column ID'].apply(
145
+ lambda x: x if isinstance(x, list) and len(x) >= 5 else (x + [None] * (5 - len(x)) if isinstance(x, list) else [None]*5)
146
+ )
147
+
148
+ # Convert list in 'Top 5 Column ID' to separate columns
149
+ new_ids_cols = pd.DataFrame(result_df['Top 5 Column ID'].tolist(), index=result_df.index, columns=["top1id", "top2id", "top3id", "top4id", "top5id"])
150
+ result_df = result_df.join(new_ids_cols)
151
+
152
+ result_df['max_sim'] = np.nanmax(result_df[table2['ID']], axis=1)
153
+
154
+ # Calculate min and max of the 'max_sim' column, ignoring NaN values
155
+ min_val = np.nanmin(result_df['max_sim'])
156
+ max_val = np.nanmax(result_df['max_sim'])
157
+
158
+ # Normalize the 'max_sim' values
159
+ result_df['max_sim_normalized'] = (result_df['max_sim'] - min_val) / (max_val - min_val)
160
+
161
+ result_final = result_df[['max_sim_normalized','top1name', 'top2name', 'top3name', 'top4name', 'top5name', 'top1id',
162
+ 'top2id', 'top3id', 'top4id', 'top5id']]
163
+
164
+
165
+ # Merge the DataFrames
166
+ result_final = table1[['ID', 'Indicator Name', 'Framework']].merge(result_final, on='ID', how='left')
167
+
168
+ # Create a mapping from ID to Framework
169
+ id_to_framework = table2.set_index('ID')['Framework'].to_dict()
170
+
171
+ # Function to map ID to Framework
172
+ def map_framework(id):
173
+ return id_to_framework.get(id, np.nan)
174
+
175
+ # Add framework information for top1id to top5id
176
+ result_final['top1framework'] = result_final['top1id'].apply(map_framework)
177
+ result_final['top2framework'] = result_final['top2id'].apply(map_framework)
178
+ result_final['top3framework'] = result_final['top3id'].apply(map_framework)
179
+ result_final['top4framework'] = result_final['top4id'].apply(map_framework)
180
+ result_final['top5framework'] = result_final['top5id'].apply(map_framework)
181
+
182
+ return result_final
indicator_harmonizer.ipynb CHANGED
@@ -21,18 +21,6 @@
21
  "## 1 Load required packages"
22
  ]
23
  },
24
- {
25
- "cell_type": "code",
26
- "execution_count": 122,
27
- "metadata": {},
28
- "outputs": [],
29
- "source": [
30
- "#! pip install transformers\n",
31
- "#! pip install torch\n",
32
- "#! pip install scipy\n",
33
- "#! pip install seaborn"
34
- ]
35
- },
36
  {
37
  "cell_type": "code",
38
  "execution_count": 1,
@@ -82,7 +70,16 @@
82
  "metadata": {},
83
  "outputs": [],
84
  "source": [
85
- "table1 = pd.read_excel('/Users/alanfortunysicart/Downloads/Indicators_Indicators_Frameworks_Default View(1).xlsx')"
 
 
 
 
 
 
 
 
 
86
  ]
87
  },
88
  {
@@ -126,7 +123,6 @@
126
  "metadata": {},
127
  "outputs": [],
128
  "source": [
129
- "table2 = pd.read_excel('/Users/alanfortunysicart/Downloads/Indicators_Indicators_Default view(14).xlsx')\n",
130
  "table2 = f.concatenate_columns(table2,columns=f.columns_embeddings_col2)\n",
131
  "\n",
132
  "if 'Indicator ID' in table2.columns:\n",
@@ -143,195 +139,13 @@
143
  "# 3. Compute the similarity between leonardo. indicator and the requested names"
144
  ]
145
  },
146
- {
147
- "cell_type": "markdown",
148
- "metadata": {},
149
- "source": [
150
- "### embeddings 1"
151
- ]
152
- },
153
- {
154
- "cell_type": "code",
155
- "execution_count": 6,
156
- "metadata": {},
157
- "outputs": [],
158
- "source": [
159
- "embeddings1 = f.get_embbedings(table1, 'concatenated_input')"
160
- ]
161
- },
162
- {
163
- "cell_type": "markdown",
164
- "metadata": {},
165
- "source": [
166
- "### embeddings 2"
167
- ]
168
- },
169
- {
170
- "cell_type": "code",
171
- "execution_count": 7,
172
- "metadata": {},
173
- "outputs": [],
174
- "source": [
175
- "embeddings2 = f.get_embbedings(table2,'concatenated_input')"
176
- ]
177
- },
178
  {
179
  "cell_type": "code",
180
  "execution_count": 8,
181
  "metadata": {},
182
  "outputs": [],
183
  "source": [
184
- "\n",
185
- "# Calculate cosine similarity between the embeddings\n",
186
- "similarities = f.cos_sim(embeddings1, embeddings2)\n"
187
- ]
188
- },
189
- {
190
- "cell_type": "code",
191
- "execution_count": 30,
192
- "metadata": {},
193
- "outputs": [],
194
- "source": [
195
- "# Create a DataFrame for the similarities\n",
196
- "result_df = pd.DataFrame(similarities, \n",
197
- " columns=table2['ID'],\n",
198
- " index=table1['ID'])\n",
199
- "\n",
200
- "# Mapping frameworks\n",
201
- "table1_sel_map = table1.set_index('ID')['Framework'].to_dict()\n",
202
- "table2_sel_map = table2.set_index('ID')['Framework'].to_dict()\n",
203
- "\n",
204
- "# Function to check if there is any common framework element\n",
205
- "def has_common_framework(table1_framework, table2_framework):\n",
206
- " table1_frameworks = set(table1_framework.split(', '))\n",
207
- " table2_frameworks = set(table2_framework.split(', '))\n",
208
- " return not table1_frameworks.isdisjoint(table2_frameworks)\n",
209
- "\n",
210
- "# Replace similarity values with NaN where the frameworks match\n",
211
- "for table1_id, table1_framework in table1_sel_map.items():\n",
212
- " for table2_id in result_df.columns:\n",
213
- " table2_framework = table2_sel_map.get(table2_id)\n",
214
- " if pd.notna(table2_framework) and pd.notna(table1_framework):\n",
215
- " if has_common_framework(table1_framework, table2_framework):\n",
216
- " result_df.loc[table1_id, table2_id] = np.nan\n",
217
- "\n",
218
- "\n",
219
- "\n",
220
- "# Function to return the column names of the top 5 values for each row\n",
221
- "def top_5_column(row):\n",
222
- " # Find the top 5 values in the row\n",
223
- " top_5_values = row.nlargest(5)\n",
224
- " # Return the column names corresponding to these values\n",
225
- " return top_5_values.index.tolist()\n",
226
- "\n",
227
- "# Convert all columns to numeric data types, coercing non-convertible values to NaN\n",
228
- "#result_df = result_df.iloc[:,1:].apply(pd.to_numeric, errors='coerce')\n",
229
- "\n",
230
- "# Get the list of non-numeric columns\n",
231
- "#non_numeric_columns = result_df.columns[result_df.dtypes == 'object']\n",
232
- "\n",
233
- "# Apply the function to each row of the DataFrame, excluding non-numeric columns\n",
234
- "result_df['Top 5 Column ID'] = result_df.apply(lambda row: top_5_column(row), axis=1)\n",
235
- "\n",
236
- "# Create a dictionary for fast lookup\n",
237
- "id_to_name = dict(zip(table2['ID'], table2['Indicator name (leonardo)']))\n",
238
- "\n",
239
- "# Function to map IDs to names\n",
240
- "def map_ids_to_names(id_list):\n",
241
- " return [id_to_name.get(id, \"ID\") for id in id_list]\n",
242
- "\n",
243
- "# Apply the function to the 'Top 5 Column ID' column\n",
244
- "result_df['Top 5 Names'] = result_df['Top 5 Column ID'].apply(map_ids_to_names)\n",
245
- "\n",
246
- "# Ensure all entries are lists and have at least 5 elements, filling missing values with None\n",
247
- "result_df['Top 5 Names'] = result_df['Top 5 Names'].apply(lambda x: x if isinstance(x, list) and len(x) >= 5 else (x + [None] * (5 - len(x)) if isinstance(x, list) else [None]*5))\n",
248
- "\n",
249
- "# Convert list in 'Top 5 Names' to separate columns\n",
250
- "new_cols = pd.DataFrame(result_df['Top 5 Names'].tolist(), index=result_df.index, columns=[\"top1name\", \"top2name\", \"top3name\", \"top4name\", \"top5name\"])\n",
251
- "result_df = result_df.join(new_cols)\n",
252
- "\n",
253
- "# Ensure all entries are lists and have exactly 5 elements, filling missing values with None\n",
254
- "result_df['Top 5 Column ID'] = result_df['Top 5 Column ID'].apply(\n",
255
- " lambda x: x if isinstance(x, list) and len(x) >= 5 else (x + [None] * (5 - len(x)) if isinstance(x, list) else [None]*5)\n",
256
- ")\n",
257
- "\n",
258
- "# Convert list in 'Top 5 Column ID' to separate columns\n",
259
- "new_ids_cols = pd.DataFrame(result_df['Top 5 Column ID'].tolist(), index=result_df.index, columns=[\"top1id\", \"top2id\", \"top3id\", \"top4id\", \"top5id\"])\n",
260
- "result_df = result_df.join(new_ids_cols)\n",
261
- "\n"
262
- ]
263
- },
264
- {
265
- "cell_type": "code",
266
- "execution_count": 31,
267
- "metadata": {},
268
- "outputs": [],
269
- "source": [
270
- "result_df['max_sim'] = np.nanmax(result_df[table2['ID']], axis=1)"
271
- ]
272
- },
273
- {
274
- "cell_type": "code",
275
- "execution_count": 32,
276
- "metadata": {},
277
- "outputs": [],
278
- "source": [
279
- "\n",
280
- "# Calculate min and max of the 'max_sim' column, ignoring NaN values\n",
281
- "min_val = np.nanmin(result_df['max_sim'])\n",
282
- "max_val = np.nanmax(result_df['max_sim'])\n",
283
- "\n",
284
- "# Normalize the 'max_sim' values\n",
285
- "result_df['max_sim_normalized'] = (result_df['max_sim'] - min_val) / (max_val - min_val)"
286
- ]
287
- },
288
- {
289
- "cell_type": "markdown",
290
- "metadata": {},
291
- "source": [
292
- "# 4 Asses the quality of the similarity, normalizing the similarity score"
293
- ]
294
- },
295
- {
296
- "cell_type": "markdown",
297
- "metadata": {},
298
- "source": [
299
- "Calculate wwhat the max similarity to identify how reliable the recommendation is and whether new indicators may be required"
300
- ]
301
- },
302
- {
303
- "cell_type": "code",
304
- "execution_count": 42,
305
- "metadata": {},
306
- "outputs": [],
307
- "source": [
308
- "result_final = result_df[['max_sim_normalized','top1name', 'top2name', 'top3name', 'top4name', 'top5name', 'top1id',\n",
309
- " 'top2id', 'top3id', 'top4id', 'top5id']]\n",
310
- "\n",
311
- "# Merge the required columns\n",
312
- "result_final = table1[['Indicator ID', 'Indicator Name', 'Framework']].merge(result_final, \n",
313
- " left_index=True, right_index=True, how='left')\n"
314
- ]
315
- },
316
- {
317
- "cell_type": "code",
318
- "execution_count": 43,
319
- "metadata": {},
320
- "outputs": [],
321
- "source": [
322
- "# Create a mapping from ID to Framework\n",
323
- "id_to_framework = table2.set_index('ID')['Framework'].to_dict()\n",
324
- "\n",
325
- "# Function to map ID to Framework\n",
326
- "def map_framework(id):\n",
327
- " return id_to_framework.get(id, np.nan)\n",
328
- "\n",
329
- "# Add framework information for top1id to top5id\n",
330
- "result_final['top1framework'] = result_final['top1id'].apply(map_framework)\n",
331
- "result_final['top2framework'] = result_final['top2id'].apply(map_framework)\n",
332
- "result_final['top3framework'] = result_final['top3id'].apply(map_framework)\n",
333
- "result_final['top4framework'] = result_final['top4id'].apply(map_framework)\n",
334
- "result_final['top5framework'] = result_final['top5id'].apply(map_framework)"
335
  ]
336
  },
337
  {
@@ -343,7 +157,7 @@
343
  },
344
  {
345
  "cell_type": "code",
346
- "execution_count": 142,
347
  "metadata": {},
348
  "outputs": [],
349
  "source": [
 
21
  "## 1 Load required packages"
22
  ]
23
  },
 
 
 
 
 
 
 
 
 
 
 
 
24
  {
25
  "cell_type": "code",
26
  "execution_count": 1,
 
70
  "metadata": {},
71
  "outputs": [],
72
  "source": [
73
+ "table1 = pd.read_excel('/Users/alanfortunysicart/Downloads/Indicators_Indicators_Frameworks_Default View(1).xlsx')\n",
74
+ "table2 = pd.read_excel('/Users/alanfortunysicart/Downloads/Indicators_Indicators_Default view(14).xlsx')\n",
75
+ "# columns to use for embeddings on table 1\n",
76
+ "\n",
77
+ "columns_embeddings_col1 = ['Indicator Name']\n",
78
+ "\n",
79
+ "# columns to use for embeddings on table 2\n",
80
+ "columns_embeddings_col2 = ['Indicator name (leonardo)']\n",
81
+ "\n",
82
+ "harmonization=True"
83
  ]
84
  },
85
  {
 
123
  "metadata": {},
124
  "outputs": [],
125
  "source": [
 
126
  "table2 = f.concatenate_columns(table2,columns=f.columns_embeddings_col2)\n",
127
  "\n",
128
  "if 'Indicator ID' in table2.columns:\n",
 
139
  "# 3. Compute the similarity between leonardo. indicator and the requested names"
140
  ]
141
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  {
143
  "cell_type": "code",
144
  "execution_count": 8,
145
  "metadata": {},
146
  "outputs": [],
147
  "source": [
148
+ "result_final = f.process_similarity_results(table1, table2, harmonization=True)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  ]
150
  },
151
  {
 
157
  },
158
  {
159
  "cell_type": "code",
160
+ "execution_count": 9,
161
  "metadata": {},
162
  "outputs": [],
163
  "source": [
requirements ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ pandas
3
+ numpy
4
+ seatable_api
5
+ transformers
6
+ torch