avantol commited on
Commit
0a4b5f6
·
1 Parent(s): 5c660a7

feat(notebook): initial attempt to add notebook

Browse files
Files changed (2) hide show
  1. ai_assisted_data_curation.ipynb +338 -0
  2. app.py +4 -4
ai_assisted_data_curation.ipynb ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "0",
6
+ "metadata": {},
7
+ "source": [
8
+ "# AI-Assisted Data Curation Toolkit"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "1",
14
+ "metadata": {},
15
+ "source": [
16
+ "This notebook demonstrates the AI-Assisted Data Curation Toolkit. It is capable of suggesting harmonizations from a source data model into a target data model using AI-backed approaches, but leaving the expert curator in complete control."
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "id": "e9f03bd7",
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "%pip install ai_harmonization"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "markdown",
31
+ "id": "2",
32
+ "metadata": {},
33
+ "source": [
34
+ "## Setup"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": null,
40
+ "id": "4",
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "import os\n",
45
+ "import json\n",
46
+ "\n",
47
+ "from ai_harmonization.interactive import (\n",
48
+ " get_interactive_table_for_suggestions,\n",
49
+ " get_nodes_and_properties_df,\n",
50
+ ")\n",
51
+ "from ai_harmonization.simple_data_model import (\n",
52
+ " SimpleDataModel,\n",
53
+ " get_data_model_as_node_prop_type_descriptions,\n",
54
+ ")\n",
55
+ "from ai_harmonization.harmonization_approaches.similarity_inmem import (\n",
56
+ " SimilaritySearchInMemoryVectorDb,\n",
57
+ ")\n",
58
+ "from ai_harmonization.harmonization_approaches.embeddings import BGEEmbeddings"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "markdown",
63
+ "id": "5",
64
+ "metadata": {},
65
+ "source": [
66
+ "Set available GPUs (skip this step if using CPUs)"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": null,
72
+ "id": "6",
73
+ "metadata": {},
74
+ "outputs": [],
75
+ "source": [
76
+ "# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0,1,2,3\" # change as necessary"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "markdown",
81
+ "id": "7",
82
+ "metadata": {},
83
+ "source": [
84
+ "## Use a Harmonization Approach to get Suggestions"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "markdown",
89
+ "id": "8",
90
+ "metadata": {},
91
+ "source": [
92
+ "### Get Input Data\n",
93
+ "\n",
94
+ "- A `source data model` you want to harmonize from\n",
95
+ "- A `target data model` you want to harmonize to\n",
96
+ "\n",
97
+ "For this initial example, you can just use hard-coded examples.\n",
98
+ "\n",
99
+ "- The `example_synthetic_source_model.json` is a synthetically generated model for example purposes\n",
100
+ "- The `example_real_source_model.json` is a real original study before ingestion into the NHLBI BioData Catalyst ecosystem (e.g. not yet harmonized)\n",
101
+ "- The `target data model` example is the **NHLBI BioData Catalyst Gen3 Data Dictionary v4.6.5** (latest version as of 21 AUG 2025)\n",
102
+ "\n",
103
+ "You can change this to supply your own source model, so long as the format follows the example. Similarly for target model. The source model will eventually come from a connection to a previously released AI-backed tool for Schema Generation, allowing this entire flow to start from arbitrary TSVs."
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": null,
109
+ "id": "9",
110
+ "metadata": {},
111
+ "outputs": [],
112
+ "source": [
113
+ "# source_file = \"./examples/example_synthetic_source_model.json\"\n",
114
+ "source_file = \"./examples/example_real_source_model.json\"\n",
115
+ "\n",
116
+ "target_file = \"./examples/example_target_model_BDC.json\"\n",
117
+ "\n",
118
+ "with open(source_file, \"r\") as f:\n",
119
+ " input_source_model = json.load(f)\n",
120
+ "\n",
121
+ "input_source_model = SimpleDataModel.get_from_unknown_json_format(\n",
122
+ " json.dumps(input_source_model)\n",
123
+ ")\n",
124
+ "\n",
125
+ "with open(target_file, \"r\") as f:\n",
126
+ " input_target_model = json.load(f)\n",
127
+ "\n",
128
+ "input_target_model = SimpleDataModel.get_from_unknown_json_format(\n",
129
+ " json.dumps(input_target_model)\n",
130
+ ")"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": null,
136
+ "id": "10",
137
+ "metadata": {},
138
+ "outputs": [],
139
+ "source": [
140
+ "print(\"Source Model\")\n",
141
+ "input_source_model.get_property_df()"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": null,
147
+ "id": "11",
148
+ "metadata": {},
149
+ "outputs": [],
150
+ "source": [
151
+ "print(\"Target Model\")\n",
152
+ "input_target_model.get_property_df()"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "markdown",
157
+ "id": "12",
158
+ "metadata": {},
159
+ "source": [
160
+ "### Use a Specific Harmonization Approach to get Suggestions"
161
+ ]
162
+ },
163
+ {
164
+ "cell_type": "code",
165
+ "execution_count": null,
166
+ "id": "13",
167
+ "metadata": {},
168
+ "outputs": [],
169
+ "source": [
170
+ "embedding_fn = BGEEmbeddings(model_name=\"BAAI/bge-large-en-v1.5\")\n",
171
+ "batch_size = 32\n",
172
+ "\n",
173
+ "harmonization_approach = SimilaritySearchInMemoryVectorDb(\n",
174
+ " # A unique name for this file and embedding algorithm within the limits of the length required by the in-memory vectorstore\n",
175
+ " vectordb_persist_directory_name=f\"{os.path.basename(target_file)[:53]}-{embedding_fn.model.name_or_path.split(\"/\")[-1][:5]}-0\",\n",
176
+ " input_target_model=input_target_model,\n",
177
+ " embedding_function=embedding_fn,\n",
178
+ " batch_size=batch_size,\n",
179
+ ")\n",
180
+ "\n",
181
+ "max_suggestions_per_property = 10\n",
182
+ "# set threshold low to just get top properties no matter what\n",
183
+ "score_threshold = 0\n",
184
+ "\n",
185
+ "suggestions = harmonization_approach.get_harmonization_suggestions(\n",
186
+ " input_source_model=input_source_model,\n",
187
+ " input_target_model=input_target_model,\n",
188
+ " score_threshold=score_threshold,\n",
189
+ " k=max_suggestions_per_property,\n",
190
+ ")"
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "markdown",
195
+ "id": "14",
196
+ "metadata": {},
197
+ "source": [
198
+ "### Visualize Suggestions"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": null,
204
+ "id": "15",
205
+ "metadata": {},
206
+ "outputs": [],
207
+ "source": [
208
+ "table_df = suggestions.to_simlified_dataframe()\n",
209
+ "table_df.sort_values(by=\"Similarity\", ascending=False, inplace=True)\n",
210
+ "table_df"
211
+ ]
212
+ },
213
+ {
214
+ "cell_type": "code",
215
+ "execution_count": null,
216
+ "id": "16",
217
+ "metadata": {},
218
+ "outputs": [],
219
+ "source": [
220
+ "# Group by 'Original Node.Property' and find the index of max similarity for each group\n",
221
+ "idx = table_df.groupby(\"Original Node.Property\")[\"Similarity\"].idxmax()\n",
222
+ "\n",
223
+ "# Filter DataFrame using the indices found above\n",
224
+ "filtered_df = table_df.loc[idx]\n",
225
+ "filtered_df.drop(columns=[\"Original Description\", \"Target Description\"], inplace=True)\n",
226
+ "filtered_df.sort_values(by=\"Similarity\", ascending=False, inplace=True)\n",
227
+ "filtered_df"
228
+ ]
229
+ },
230
+ {
231
+ "cell_type": "markdown",
232
+ "id": "17",
233
+ "metadata": {},
234
+ "source": [
235
+ "### Create Interactive Table for Selecting Suggestions"
236
+ ]
237
+ },
238
+ {
239
+ "cell_type": "code",
240
+ "execution_count": null,
241
+ "id": "18",
242
+ "metadata": {},
243
+ "outputs": [],
244
+ "source": [
245
+ "table = get_interactive_table_for_suggestions(\n",
246
+ " table_df,\n",
247
+ " column_for_filtering=1,\n",
248
+ " # additional config for the interactive table\n",
249
+ " maxBytes=\"2MB\",\n",
250
+ " pageLength=50,\n",
251
+ ")\n",
252
+ "table"
253
+ ]
254
+ },
255
+ {
256
+ "cell_type": "markdown",
257
+ "id": "19",
258
+ "metadata": {},
259
+ "source": [
260
+ "> **Don't see the table or see an error above?** Try restarting the kernel, then try restarting jupyter lab (if that's what you're using). The installs for AnyWidgets might not be picked up yet.\n",
261
+ "\n",
262
+ "> **Dark Theme?** If you're using a dark theme, you might need to switch to light for the table to display properly. \n",
263
+ "\n",
264
+ "> **Using VS Code Jupyter Extension?** Links might not work"
265
+ ]
266
+ },
267
+ {
268
+ "cell_type": "markdown",
269
+ "id": "20",
270
+ "metadata": {},
271
+ "source": [
272
+ "To use the selections above, record them below in `manual_selection_indexes` or use multi-select in the above table and the below will automatically use those. "
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "code",
277
+ "execution_count": null,
278
+ "id": "21",
279
+ "metadata": {},
280
+ "outputs": [],
281
+ "source": [
282
+ "# Fill this out manually as you go, or we'll use the table selections\n",
283
+ "manual_selection_indexes = [] # [1, 8, 24, ...]\n",
284
+ "\n",
285
+ "selected_rows = manual_selection_indexes or table.selected_rows\n",
286
+ "\n",
287
+ "print(f\"Selected Suggestions: {selected_rows}\")"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "code",
292
+ "execution_count": null,
293
+ "id": "22",
294
+ "metadata": {},
295
+ "outputs": [],
296
+ "source": [
297
+ "table_df.loc[selected_rows]"
298
+ ]
299
+ },
300
+ {
301
+ "cell_type": "code",
302
+ "execution_count": null,
303
+ "id": "23",
304
+ "metadata": {},
305
+ "outputs": [],
306
+ "source": [
307
+ "table_df.loc[selected_rows].to_csv(\n",
308
+ " \"./selected_suggestions.tsv\",\n",
309
+ " index=False,\n",
310
+ " na_rep=\"N/A\",\n",
311
+ " sep=\"\\t\",\n",
312
+ " quotechar='\"',\n",
313
+ ")"
314
+ ]
315
+ }
316
+ ],
317
+ "metadata": {
318
+ "kernelspec": {
319
+ "display_name": "ai-harmonization (3.13.5)",
320
+ "language": "python",
321
+ "name": "python3"
322
+ },
323
+ "language_info": {
324
+ "codemirror_mode": {
325
+ "name": "ipython",
326
+ "version": 3
327
+ },
328
+ "file_extension": ".py",
329
+ "mimetype": "text/x-python",
330
+ "name": "python",
331
+ "nbconvert_exporter": "python",
332
+ "pygments_lexer": "ipython3",
333
+ "version": "3.13.5"
334
+ }
335
+ },
336
+ "nbformat": 4,
337
+ "nbformat_minor": 5
338
+ }
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
1
  import gradio as gr
2
 
3
+ def show_link():
4
+ return "Check out the Jupyter notebook demo: https://huggingface.co/spaces/uc-ctds/ai_assisted_data_curation_toolkit/blob/main/ai_assisted_data_curation.ipynb"
5
 
6
+ interface = gr.Interface(fn=show_link, inputs=None, outputs="text")
7
+ interface.launch()