afortuny committed on
Commit
ccada5d
·
1 Parent(s): 266bd2c

create indicator recommend app

Browse files
Files changed (4) hide show
  1. app.py +38 -0
  2. functions.py +114 -5
  3. indicator_harmonizer.ipynb +12 -198
  4. requirements +9 -0
app.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+
4
+ def process_tables(table1_path, table2_path, columns_embeddings_col1, columns_embeddings_col2, harmonization):
5
+ # Assuming process_similarity_results is a function defined in the uploaded functions.py
6
+ # that we've adapted based on previous discussions.
7
+ import functions
8
+
9
+ # Load tables
10
+ table1 = pd.read_csv(table1_path)
11
+ table2 = pd.read_csv(table2_path)
12
+
13
+ # You would include here the logic from your Jupyter notebook
14
+ # This should involve any preprocessing, calling the similarity function, etc.
15
+ result_final = functions.process_similarity_results(table1, table2, harmonization)
16
+
17
+ # Export to CSV
18
+ result_path = "/mnt/data/result_final.csv"
19
+ result_final.to_csv(result_path, index=False)
20
+ return result_path
21
+
22
+ # Define Gradio interface
23
+ iface = gr.Interface(
24
+ fn=process_tables,
25
+ inputs=[
26
+ gr.File(label="Upload Table 1 (Client Indicators or Framework Table)"),
27
+ gr.File(label="Upload Table 2 (Internal Indicator or Indicator Table)"),
28
+ gr.Textbox(label="Columns for Embeddings in Table 1", default="Indicator Name", placeholder="Enter column names separated by commas"),
29
+ gr.Textbox(label="Columns for Embeddings in Table 2", default="Indicator name (leonardo)", placeholder="Enter column names separated by commas"),
30
+ gr.Checkbox(label="Harmonization Mode", value=True)
31
+ ],
32
+ outputs=[
33
+ gr.File(label="Download Processed Results")
34
+ ],
35
+ description="Upload two tables and process them based on the selected parameters. If harmonization, set Table 1 as the Framework Table and Table 2 as the Indicator Table. Otherwise, set Table 1 as the Client Indicators Table and Table 2 as the Internal Indicator Table."
36
+ )
37
+
38
+ iface.launch()
functions.py CHANGED
@@ -9,6 +9,9 @@ from pandas import json_normalize
9
  from transformers import AutoTokenizer, AutoModel
10
  import torch
11
  import torch.nn.functional as F
 
 
 
12
 
13
  ### Parameters not expected to be changed in every run
14
 
@@ -20,11 +23,6 @@ columns_embeddings_col1 = ['Indicator Name']
20
  columns_embeddings_col2 = ['Indicator name (leonardo)']
21
 
22
 
23
- # ID column
24
-
25
- table1_id_col = ['ID']
26
- table2_id_col = ['ID']
27
-
28
  #### Functions
29
 
30
  from numpy.linalg import norm
@@ -71,3 +69,114 @@ def get_embbedings(table, colname):
71
  sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
72
 
73
  return sentence_embeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  from transformers import AutoTokenizer, AutoModel
10
  import torch
11
  import torch.nn.functional as F
12
+ from seatable_api import Base, context
13
+ from pandas import json_normalize
14
+ import importlib
15
 
16
  ### Parameters not expected to be changed in every run
17
 
 
23
  columns_embeddings_col2 = ['Indicator name (leonardo)']
24
 
25
 
 
 
 
 
 
26
  #### Functions
27
 
28
  from numpy.linalg import norm
 
69
  sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
70
 
71
  return sentence_embeddings
72
+
73
+
74
+ # Process similarity
75
+
76
+ def process_similarity_results(table1, table2, harmonization=True):
77
+
78
+ embeddings1 = get_embbedings(table1, 'concatenated_input')
79
+ embeddings2 = get_embbedings(table2,'concatenated_input')
80
+
81
+ # Calculate cosine similarity between the embeddings
82
+ similarities = cos_sim(embeddings1, embeddings2)
83
+
84
+ # Create a DataFrame for the similarities
85
+ result_df = pd.DataFrame(similarities,
86
+ columns=table2['ID'],
87
+ index=table1['ID'])
88
+ if harmonization:
89
+
90
+ # Mapping frameworks
91
+ table1_sel_map = table1.set_index('ID')['Framework'].to_dict()
92
+ table2_sel_map = table2.set_index('ID')['Framework'].to_dict()
93
+
94
+ # Function to check if there is any common framework element
95
+ def has_common_framework(table1_framework, table2_framework):
96
+ table1_frameworks = set(table1_framework.split(', '))
97
+ table2_frameworks = set(table2_framework.split(', '))
98
+ return not table1_frameworks.isdisjoint(table2_frameworks)
99
+
100
+ # Replace similarity values with NaN where the frameworks match
101
+ for table1_id, table1_framework in table1_sel_map.items():
102
+ for table2_id in result_df.columns:
103
+ table2_framework = table2_sel_map.get(table2_id)
104
+ if pd.notna(table2_framework) and pd.notna(table1_framework):
105
+ if has_common_framework(table1_framework, table2_framework):
106
+ result_df.loc[table1_id, table2_id] = np.nan
107
+
108
+
109
+
110
+ # Function to return the column names of the top 5 values for each row
111
+ def top_5_column(row):
112
+ # Find the top 5 values in the row
113
+ top_5_values = row.nlargest(5)
114
+ # Return the column names corresponding to these values
115
+ return top_5_values.index.tolist()
116
+
117
+ # Convert all columns to numeric data types, coercing non-convertible values to NaN
118
+ #result_df = result_df.iloc[:,1:].apply(pd.to_numeric, errors='coerce')
119
+
120
+ # Get the list of non-numeric columns
121
+ #non_numeric_columns = result_df.columns[result_df.dtypes == 'object']
122
+
123
+ # Apply the function to each row of the DataFrame, excluding non-numeric columns
124
+ result_df['Top 5 Column ID'] = result_df.apply(lambda row: top_5_column(row), axis=1)
125
+
126
+ # Create a dictionary for fast lookup
127
+ id_to_name = dict(zip(table2['ID'], table2['Indicator name (leonardo)']))
128
+
129
+ # Function to map IDs to names
130
+ def map_ids_to_names(id_list):
131
+ return [id_to_name.get(id, "ID") for id in id_list]
132
+
133
+ # Apply the function to the 'Top 5 Column ID' column
134
+ result_df['Top 5 Names'] = result_df['Top 5 Column ID'].apply(map_ids_to_names)
135
+
136
+ # Ensure all entries are lists and have at least 5 elements, filling missing values with None
137
+ result_df['Top 5 Names'] = result_df['Top 5 Names'].apply(lambda x: x if isinstance(x, list) and len(x) >= 5 else (x + [None] * (5 - len(x)) if isinstance(x, list) else [None]*5))
138
+
139
+ # Convert list in 'Top 5 Names' to separate columns
140
+ new_cols = pd.DataFrame(result_df['Top 5 Names'].tolist(), index=result_df.index, columns=["top1name", "top2name", "top3name", "top4name", "top5name"])
141
+ result_df = result_df.join(new_cols)
142
+
143
+ # Ensure all entries are lists and have exactly 5 elements, filling missing values with None
144
+ result_df['Top 5 Column ID'] = result_df['Top 5 Column ID'].apply(
145
+ lambda x: x if isinstance(x, list) and len(x) >= 5 else (x + [None] * (5 - len(x)) if isinstance(x, list) else [None]*5)
146
+ )
147
+
148
+ # Convert list in 'Top 5 Column ID' to separate columns
149
+ new_ids_cols = pd.DataFrame(result_df['Top 5 Column ID'].tolist(), index=result_df.index, columns=["top1id", "top2id", "top3id", "top4id", "top5id"])
150
+ result_df = result_df.join(new_ids_cols)
151
+
152
+ result_df['max_sim'] = np.nanmax(result_df[table2['ID']], axis=1)
153
+
154
+ # Calculate min and max of the 'max_sim' column, ignoring NaN values
155
+ min_val = np.nanmin(result_df['max_sim'])
156
+ max_val = np.nanmax(result_df['max_sim'])
157
+
158
+ # Normalize the 'max_sim' values
159
+ result_df['max_sim_normalized'] = (result_df['max_sim'] - min_val) / (max_val - min_val)
160
+
161
+ result_final = result_df[['max_sim_normalized','top1name', 'top2name', 'top3name', 'top4name', 'top5name', 'top1id',
162
+ 'top2id', 'top3id', 'top4id', 'top5id']]
163
+
164
+
165
+ # Merge the DataFrames
166
+ result_final = table1[['ID', 'Indicator Name', 'Framework']].merge(result_final, on='ID', how='left')
167
+
168
+ # Create a mapping from ID to Framework
169
+ id_to_framework = table2.set_index('ID')['Framework'].to_dict()
170
+
171
+ # Function to map ID to Framework
172
+ def map_framework(id):
173
+ return id_to_framework.get(id, np.nan)
174
+
175
+ # Add framework information for top1id to top5id
176
+ result_final['top1framework'] = result_final['top1id'].apply(map_framework)
177
+ result_final['top2framework'] = result_final['top2id'].apply(map_framework)
178
+ result_final['top3framework'] = result_final['top3id'].apply(map_framework)
179
+ result_final['top4framework'] = result_final['top4id'].apply(map_framework)
180
+ result_final['top5framework'] = result_final['top5id'].apply(map_framework)
181
+
182
+ return result_final
indicator_harmonizer.ipynb CHANGED
@@ -21,18 +21,6 @@
21
  "## 1 Load required packages"
22
  ]
23
  },
24
- {
25
- "cell_type": "code",
26
- "execution_count": 122,
27
- "metadata": {},
28
- "outputs": [],
29
- "source": [
30
- "#! pip install transformers\n",
31
- "#! pip install torch\n",
32
- "#! pip install scipy\n",
33
- "#! pip install seaborn"
34
- ]
35
- },
36
  {
37
  "cell_type": "code",
38
  "execution_count": 1,
@@ -82,7 +70,16 @@
82
  "metadata": {},
83
  "outputs": [],
84
  "source": [
85
- "table1 = pd.read_excel('/Users/alanfortunysicart/Downloads/Indicators_Indicators_Frameworks_Default View(1).xlsx')"
 
 
 
 
 
 
 
 
 
86
  ]
87
  },
88
  {
@@ -126,7 +123,6 @@
126
  "metadata": {},
127
  "outputs": [],
128
  "source": [
129
- "table2 = pd.read_excel('/Users/alanfortunysicart/Downloads/Indicators_Indicators_Default view(14).xlsx')\n",
130
  "table2 = f.concatenate_columns(table2,columns=f.columns_embeddings_col2)\n",
131
  "\n",
132
  "if 'Indicator ID' in table2.columns:\n",
@@ -143,195 +139,13 @@
143
  "# 3. Compute the similarity between leonardo. indicator and the requested names"
144
  ]
145
  },
146
- {
147
- "cell_type": "markdown",
148
- "metadata": {},
149
- "source": [
150
- "### embeddings 1"
151
- ]
152
- },
153
- {
154
- "cell_type": "code",
155
- "execution_count": 6,
156
- "metadata": {},
157
- "outputs": [],
158
- "source": [
159
- "embeddings1 = f.get_embbedings(table1, 'concatenated_input')"
160
- ]
161
- },
162
- {
163
- "cell_type": "markdown",
164
- "metadata": {},
165
- "source": [
166
- "### embeddings 2"
167
- ]
168
- },
169
- {
170
- "cell_type": "code",
171
- "execution_count": 7,
172
- "metadata": {},
173
- "outputs": [],
174
- "source": [
175
- "embeddings2 = f.get_embbedings(table2,'concatenated_input')"
176
- ]
177
- },
178
  {
179
  "cell_type": "code",
180
  "execution_count": 8,
181
  "metadata": {},
182
  "outputs": [],
183
  "source": [
184
- "\n",
185
- "# Calculate cosine similarity between the embeddings\n",
186
- "similarities = f.cos_sim(embeddings1, embeddings2)\n"
187
- ]
188
- },
189
- {
190
- "cell_type": "code",
191
- "execution_count": 30,
192
- "metadata": {},
193
- "outputs": [],
194
- "source": [
195
- "# Create a DataFrame for the similarities\n",
196
- "result_df = pd.DataFrame(similarities, \n",
197
- " columns=table2['ID'],\n",
198
- " index=table1['ID'])\n",
199
- "\n",
200
- "# Mapping frameworks\n",
201
- "table1_sel_map = table1.set_index('ID')['Framework'].to_dict()\n",
202
- "table2_sel_map = table2.set_index('ID')['Framework'].to_dict()\n",
203
- "\n",
204
- "# Function to check if there is any common framework element\n",
205
- "def has_common_framework(table1_framework, table2_framework):\n",
206
- " table1_frameworks = set(table1_framework.split(', '))\n",
207
- " table2_frameworks = set(table2_framework.split(', '))\n",
208
- " return not table1_frameworks.isdisjoint(table2_frameworks)\n",
209
- "\n",
210
- "# Replace similarity values with NaN where the frameworks match\n",
211
- "for table1_id, table1_framework in table1_sel_map.items():\n",
212
- " for table2_id in result_df.columns:\n",
213
- " table2_framework = table2_sel_map.get(table2_id)\n",
214
- " if pd.notna(table2_framework) and pd.notna(table1_framework):\n",
215
- " if has_common_framework(table1_framework, table2_framework):\n",
216
- " result_df.loc[table1_id, table2_id] = np.nan\n",
217
- "\n",
218
- "\n",
219
- "\n",
220
- "# Function to return the column names of the top 5 values for each row\n",
221
- "def top_5_column(row):\n",
222
- " # Find the top 5 values in the row\n",
223
- " top_5_values = row.nlargest(5)\n",
224
- " # Return the column names corresponding to these values\n",
225
- " return top_5_values.index.tolist()\n",
226
- "\n",
227
- "# Convert all columns to numeric data types, coercing non-convertible values to NaN\n",
228
- "#result_df = result_df.iloc[:,1:].apply(pd.to_numeric, errors='coerce')\n",
229
- "\n",
230
- "# Get the list of non-numeric columns\n",
231
- "#non_numeric_columns = result_df.columns[result_df.dtypes == 'object']\n",
232
- "\n",
233
- "# Apply the function to each row of the DataFrame, excluding non-numeric columns\n",
234
- "result_df['Top 5 Column ID'] = result_df.apply(lambda row: top_5_column(row), axis=1)\n",
235
- "\n",
236
- "# Create a dictionary for fast lookup\n",
237
- "id_to_name = dict(zip(table2['ID'], table2['Indicator name (leonardo)']))\n",
238
- "\n",
239
- "# Function to map IDs to names\n",
240
- "def map_ids_to_names(id_list):\n",
241
- " return [id_to_name.get(id, \"ID\") for id in id_list]\n",
242
- "\n",
243
- "# Apply the function to the 'Top 5 Column ID' column\n",
244
- "result_df['Top 5 Names'] = result_df['Top 5 Column ID'].apply(map_ids_to_names)\n",
245
- "\n",
246
- "# Ensure all entries are lists and have at least 5 elements, filling missing values with None\n",
247
- "result_df['Top 5 Names'] = result_df['Top 5 Names'].apply(lambda x: x if isinstance(x, list) and len(x) >= 5 else (x + [None] * (5 - len(x)) if isinstance(x, list) else [None]*5))\n",
248
- "\n",
249
- "# Convert list in 'Top 5 Names' to separate columns\n",
250
- "new_cols = pd.DataFrame(result_df['Top 5 Names'].tolist(), index=result_df.index, columns=[\"top1name\", \"top2name\", \"top3name\", \"top4name\", \"top5name\"])\n",
251
- "result_df = result_df.join(new_cols)\n",
252
- "\n",
253
- "# Ensure all entries are lists and have exactly 5 elements, filling missing values with None\n",
254
- "result_df['Top 5 Column ID'] = result_df['Top 5 Column ID'].apply(\n",
255
- " lambda x: x if isinstance(x, list) and len(x) >= 5 else (x + [None] * (5 - len(x)) if isinstance(x, list) else [None]*5)\n",
256
- ")\n",
257
- "\n",
258
- "# Convert list in 'Top 5 Column ID' to separate columns\n",
259
- "new_ids_cols = pd.DataFrame(result_df['Top 5 Column ID'].tolist(), index=result_df.index, columns=[\"top1id\", \"top2id\", \"top3id\", \"top4id\", \"top5id\"])\n",
260
- "result_df = result_df.join(new_ids_cols)\n",
261
- "\n"
262
- ]
263
- },
264
- {
265
- "cell_type": "code",
266
- "execution_count": 31,
267
- "metadata": {},
268
- "outputs": [],
269
- "source": [
270
- "result_df['max_sim'] = np.nanmax(result_df[table2['ID']], axis=1)"
271
- ]
272
- },
273
- {
274
- "cell_type": "code",
275
- "execution_count": 32,
276
- "metadata": {},
277
- "outputs": [],
278
- "source": [
279
- "\n",
280
- "# Calculate min and max of the 'max_sim' column, ignoring NaN values\n",
281
- "min_val = np.nanmin(result_df['max_sim'])\n",
282
- "max_val = np.nanmax(result_df['max_sim'])\n",
283
- "\n",
284
- "# Normalize the 'max_sim' values\n",
285
- "result_df['max_sim_normalized'] = (result_df['max_sim'] - min_val) / (max_val - min_val)"
286
- ]
287
- },
288
- {
289
- "cell_type": "markdown",
290
- "metadata": {},
291
- "source": [
292
- "# 4 Asses the quality of the similarity, normalizing the similarity score"
293
- ]
294
- },
295
- {
296
- "cell_type": "markdown",
297
- "metadata": {},
298
- "source": [
299
- "Calculate wwhat the max similarity to identify how reliable the recommendation is and whether new indicators may be required"
300
- ]
301
- },
302
- {
303
- "cell_type": "code",
304
- "execution_count": 42,
305
- "metadata": {},
306
- "outputs": [],
307
- "source": [
308
- "result_final = result_df[['max_sim_normalized','top1name', 'top2name', 'top3name', 'top4name', 'top5name', 'top1id',\n",
309
- " 'top2id', 'top3id', 'top4id', 'top5id']]\n",
310
- "\n",
311
- "# Merge the required columns\n",
312
- "result_final = table1[['Indicator ID', 'Indicator Name', 'Framework']].merge(result_final, \n",
313
- " left_index=True, right_index=True, how='left')\n"
314
- ]
315
- },
316
- {
317
- "cell_type": "code",
318
- "execution_count": 43,
319
- "metadata": {},
320
- "outputs": [],
321
- "source": [
322
- "# Create a mapping from ID to Framework\n",
323
- "id_to_framework = table2.set_index('ID')['Framework'].to_dict()\n",
324
- "\n",
325
- "# Function to map ID to Framework\n",
326
- "def map_framework(id):\n",
327
- " return id_to_framework.get(id, np.nan)\n",
328
- "\n",
329
- "# Add framework information for top1id to top5id\n",
330
- "result_final['top1framework'] = result_final['top1id'].apply(map_framework)\n",
331
- "result_final['top2framework'] = result_final['top2id'].apply(map_framework)\n",
332
- "result_final['top3framework'] = result_final['top3id'].apply(map_framework)\n",
333
- "result_final['top4framework'] = result_final['top4id'].apply(map_framework)\n",
334
- "result_final['top5framework'] = result_final['top5id'].apply(map_framework)"
335
  ]
336
  },
337
  {
@@ -343,7 +157,7 @@
343
  },
344
  {
345
  "cell_type": "code",
346
- "execution_count": 142,
347
  "metadata": {},
348
  "outputs": [],
349
  "source": [
 
21
  "## 1 Load required packages"
22
  ]
23
  },
 
 
 
 
 
 
 
 
 
 
 
 
24
  {
25
  "cell_type": "code",
26
  "execution_count": 1,
 
70
  "metadata": {},
71
  "outputs": [],
72
  "source": [
73
+ "table1 = pd.read_excel('/Users/alanfortunysicart/Downloads/Indicators_Indicators_Frameworks_Default View(1).xlsx')\n",
74
+ "table2 = pd.read_excel('/Users/alanfortunysicart/Downloads/Indicators_Indicators_Default view(14).xlsx')\n",
75
+ "# columns to use for embeddings on table 1\n",
76
+ "\n",
77
+ "columns_embeddings_col1 = ['Indicator Name']\n",
78
+ "\n",
79
+ "# columns to use for embeddings on table 2\n",
80
+ "columns_embeddings_col2 = ['Indicator name (leonardo)']\n",
81
+ "\n",
82
+ "harmonization=True"
83
  ]
84
  },
85
  {
 
123
  "metadata": {},
124
  "outputs": [],
125
  "source": [
 
126
  "table2 = f.concatenate_columns(table2,columns=f.columns_embeddings_col2)\n",
127
  "\n",
128
  "if 'Indicator ID' in table2.columns:\n",
 
139
  "# 3. Compute the similarity between leonardo. indicator and the requested names"
140
  ]
141
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  {
143
  "cell_type": "code",
144
  "execution_count": 8,
145
  "metadata": {},
146
  "outputs": [],
147
  "source": [
148
+ "result_final = f.process_similarity_results(table1, table2, harmonization=True)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  ]
150
  },
151
  {
 
157
  },
158
  {
159
  "cell_type": "code",
160
+ "execution_count": 9,
161
  "metadata": {},
162
  "outputs": [],
163
  "source": [
requirements ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ pandas
3
+ numpy
4
+ seatable_api
5
+ transformers
6
+ torch