Updated app with code for deduplication
Browse files
app.py
CHANGED
|
@@ -26,27 +26,24 @@ def deduplicate(embedding_matrix: np.ndarray, threshold: float, batch_size: int
|
|
| 26 |
"""
|
| 27 |
Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
|
| 28 |
"""
|
| 29 |
-
#
|
| 30 |
-
progress.tqdm
|
| 31 |
-
with progress.tqdm(total=1, desc="Building index") as p:
|
| 32 |
reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
|
| 33 |
p.update(1)
|
| 34 |
|
| 35 |
deduplicated_indices = set(range(len(embedding_matrix)))
|
| 36 |
duplicate_to_original_mapping = {}
|
| 37 |
|
| 38 |
-
#
|
| 39 |
-
progress.tqdm.write("Finding nearest neighbors...")
|
| 40 |
results = reach.nearest_neighbor_threshold(
|
| 41 |
embedding_matrix,
|
| 42 |
threshold=threshold,
|
| 43 |
batch_size=batch_size,
|
| 44 |
-
show_progressbar=False
|
| 45 |
)
|
| 46 |
|
| 47 |
-
total_items = len(embedding_matrix)
|
| 48 |
# Processing duplicates with a progress bar
|
| 49 |
-
|
| 50 |
for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
|
| 51 |
if i not in deduplicated_indices:
|
| 52 |
continue
|
|
@@ -64,27 +61,24 @@ def deduplicate_across_datasets(embedding_matrix_1: np.ndarray, embedding_matrix
|
|
| 64 |
"""
|
| 65 |
Deduplicate embeddings across two datasets and return the indices of duplicates between them.
|
| 66 |
"""
|
| 67 |
-
#
|
| 68 |
-
progress.tqdm
|
| 69 |
-
with progress.tqdm(total=1, desc="Building index for Dataset 1") as p:
|
| 70 |
reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
|
| 71 |
p.update(1)
|
| 72 |
|
| 73 |
duplicate_indices_in_test = []
|
| 74 |
duplicate_to_original_mapping = {}
|
| 75 |
|
| 76 |
-
#
|
| 77 |
-
progress.tqdm.write("Finding nearest neighbors between datasets...")
|
| 78 |
results = reach.nearest_neighbor_threshold(
|
| 79 |
embedding_matrix_2,
|
| 80 |
threshold=threshold,
|
| 81 |
batch_size=batch_size,
|
| 82 |
-
show_progressbar=False
|
| 83 |
)
|
| 84 |
|
| 85 |
total_items = len(embedding_matrix_2)
|
| 86 |
# Processing duplicates with a progress bar
|
| 87 |
-
progress.tqdm.write("Processing duplicates across datasets...")
|
| 88 |
for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
|
| 89 |
similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
|
| 90 |
|
|
@@ -128,14 +122,11 @@ def perform_deduplication(
|
|
| 128 |
else:
|
| 129 |
ds = load_dataset(dataset1_name, split=dataset1_split)
|
| 130 |
|
| 131 |
-
# Extract texts
|
| 132 |
-
|
| 133 |
-
texts = [example[dataset1_text_column] for example in progress.tqdm(ds, desc="Extracting texts", total=len(ds))]
|
| 134 |
|
| 135 |
-
# Compute embeddings
|
| 136 |
-
|
| 137 |
-
embedding_matrix = model.encode(texts, show_progressbar=False) # Disable internal progress bar
|
| 138 |
-
embedding_matrix = progress.tqdm(embedding_matrix, desc="Computing embeddings", total=len(texts))
|
| 139 |
|
| 140 |
# Deduplicate
|
| 141 |
result_text = deduplicate_and_prepare_results_single(
|
|
@@ -158,22 +149,16 @@ def perform_deduplication(
|
|
| 158 |
ds2 = load_dataset(dataset2_name, split=dataset2_split)
|
| 159 |
|
| 160 |
# Extract texts from Dataset 1
|
| 161 |
-
|
| 162 |
-
texts1 = [example[dataset1_text_column] for example in progress.tqdm(ds1, desc="Extracting texts from Dataset 1", total=len(ds1))]
|
| 163 |
|
| 164 |
# Extract texts from Dataset 2
|
| 165 |
-
|
| 166 |
-
texts2 = [example[dataset2_text_column] for example in progress.tqdm(ds2, desc="Extracting texts from Dataset 2", total=len(ds2))]
|
| 167 |
|
| 168 |
# Compute embeddings for Dataset 1
|
| 169 |
-
|
| 170 |
-
embedding_matrix1 = model.encode(texts1, show_progressbar=False)
|
| 171 |
-
embedding_matrix1 = progress.tqdm(embedding_matrix1, desc="Computing embeddings for Dataset 1", total=len(texts1))
|
| 172 |
|
| 173 |
# Compute embeddings for Dataset 2
|
| 174 |
-
|
| 175 |
-
embedding_matrix2 = model.encode(texts2, show_progressbar=False)
|
| 176 |
-
embedding_matrix2 = progress.tqdm(embedding_matrix2, desc="Computing embeddings for Dataset 2", total=len(texts2))
|
| 177 |
|
| 178 |
# Deduplicate across datasets
|
| 179 |
result_text = deduplicate_and_prepare_results_cross(
|
|
@@ -322,6 +307,7 @@ with gr.Blocks() as demo:
|
|
| 322 |
demo.launch()
|
| 323 |
|
| 324 |
|
|
|
|
| 325 |
# import gradio as gr
|
| 326 |
# from datasets import load_dataset
|
| 327 |
# import numpy as np
|
|
|
|
| 26 |
"""
|
| 27 |
Deduplicate embeddings and return the deduplicated indices and a mapping of removed indices to their corresponding original indices.
|
| 28 |
"""
|
| 29 |
+
# Building the index with a progress bar
|
| 30 |
+
with progress.tqdm(total=1, desc="Building search index") as p:
|
|
|
|
| 31 |
reach = Reach(vectors=embedding_matrix, items=[str(i) for i in range(len(embedding_matrix))])
|
| 32 |
p.update(1)
|
| 33 |
|
| 34 |
deduplicated_indices = set(range(len(embedding_matrix)))
|
| 35 |
duplicate_to_original_mapping = {}
|
| 36 |
|
| 37 |
+
# Finding nearest neighbors
|
|
|
|
| 38 |
results = reach.nearest_neighbor_threshold(
|
| 39 |
embedding_matrix,
|
| 40 |
threshold=threshold,
|
| 41 |
batch_size=batch_size,
|
| 42 |
+
show_progressbar=True # Allow internal progress bar
|
| 43 |
)
|
| 44 |
|
|
|
|
| 45 |
# Processing duplicates with a progress bar
|
| 46 |
+
total_items = len(embedding_matrix)
|
| 47 |
for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates", total=total_items)):
|
| 48 |
if i not in deduplicated_indices:
|
| 49 |
continue
|
|
|
|
| 61 |
"""
|
| 62 |
Deduplicate embeddings across two datasets and return the indices of duplicates between them.
|
| 63 |
"""
|
| 64 |
+
# Building the index from Dataset 1
|
| 65 |
+
with progress.tqdm(total=1, desc="Building search index from Dataset 1") as p:
|
|
|
|
| 66 |
reach = Reach(vectors=embedding_matrix_1, items=[str(i) for i in range(len(embedding_matrix_1))])
|
| 67 |
p.update(1)
|
| 68 |
|
| 69 |
duplicate_indices_in_test = []
|
| 70 |
duplicate_to_original_mapping = {}
|
| 71 |
|
| 72 |
+
# Finding nearest neighbors between datasets
|
|
|
|
| 73 |
results = reach.nearest_neighbor_threshold(
|
| 74 |
embedding_matrix_2,
|
| 75 |
threshold=threshold,
|
| 76 |
batch_size=batch_size,
|
| 77 |
+
show_progressbar=True # Allow internal progress bar
|
| 78 |
)
|
| 79 |
|
| 80 |
total_items = len(embedding_matrix_2)
|
| 81 |
# Processing duplicates with a progress bar
|
|
|
|
| 82 |
for i, similar_items in enumerate(progress.tqdm(results, desc="Processing duplicates across datasets", total=total_items)):
|
| 83 |
similar_indices = [int(item[0]) for item in similar_items if item[1] >= threshold]
|
| 84 |
|
|
|
|
| 122 |
else:
|
| 123 |
ds = load_dataset(dataset1_name, split=dataset1_split)
|
| 124 |
|
| 125 |
+
# Extract texts
|
| 126 |
+
texts = [example[dataset1_text_column] for example in ds]
|
|
|
|
| 127 |
|
| 128 |
+
# Compute embeddings
|
| 129 |
+
embedding_matrix = model.encode(texts, show_progressbar=True) # Enable internal progress bar
|
|
|
|
|
|
|
| 130 |
|
| 131 |
# Deduplicate
|
| 132 |
result_text = deduplicate_and_prepare_results_single(
|
|
|
|
| 149 |
ds2 = load_dataset(dataset2_name, split=dataset2_split)
|
| 150 |
|
| 151 |
# Extract texts from Dataset 1
|
| 152 |
+
texts1 = [example[dataset1_text_column] for example in ds1]
|
|
|
|
| 153 |
|
| 154 |
# Extract texts from Dataset 2
|
| 155 |
+
texts2 = [example[dataset2_text_column] for example in ds2]
|
|
|
|
| 156 |
|
| 157 |
# Compute embeddings for Dataset 1
|
| 158 |
+
embedding_matrix1 = model.encode(texts1, show_progressbar=True)
|
|
|
|
|
|
|
| 159 |
|
| 160 |
# Compute embeddings for Dataset 2
|
| 161 |
+
embedding_matrix2 = model.encode(texts2, show_progressbar=True)
|
|
|
|
|
|
|
| 162 |
|
| 163 |
# Deduplicate across datasets
|
| 164 |
result_text = deduplicate_and_prepare_results_cross(
|
|
|
|
| 307 |
demo.launch()
|
| 308 |
|
| 309 |
|
| 310 |
+
|
| 311 |
# import gradio as gr
|
| 312 |
# from datasets import load_dataset
|
| 313 |
# import numpy as np
|