Updates
Browse files
app.py
CHANGED
|
@@ -68,13 +68,10 @@ def perform_deduplication(
|
|
| 68 |
embeddings = []
|
| 69 |
batch_size = 64
|
| 70 |
total_batches = (len(texts) + batch_size - 1) // batch_size
|
| 71 |
-
for i, batch_texts in enumerate(batch_iterable(texts, batch_size)):
|
|
|
|
| 72 |
batch_embeddings = model.encode(batch_texts, show_progressbar=False)
|
| 73 |
embeddings.append(batch_embeddings)
|
| 74 |
-
# Update progress
|
| 75 |
-
progress((i + 1) / total_batches, desc="Computing embeddings for Dataset 1")
|
| 76 |
-
# Yield control back to Gradio
|
| 77 |
-
yield status, ""
|
| 78 |
embedding_matrix = np.concatenate(embeddings, axis=0)
|
| 79 |
|
| 80 |
# Deduplicate
|
|
@@ -145,13 +142,9 @@ def perform_deduplication(
|
|
| 145 |
embeddings1 = []
|
| 146 |
batch_size = 64
|
| 147 |
total_batches1 = (len(texts1) + batch_size - 1) // batch_size
|
| 148 |
-
for i, batch_texts in enumerate(batch_iterable(texts1, batch_size)):
|
| 149 |
batch_embeddings = model.encode(batch_texts, show_progressbar=False)
|
| 150 |
embeddings1.append(batch_embeddings)
|
| 151 |
-
# Update progress
|
| 152 |
-
progress((i + 1) / total_batches1, desc="Computing embeddings for Dataset 1")
|
| 153 |
-
# Yield control back to Gradio
|
| 154 |
-
yield status, ""
|
| 155 |
embedding_matrix1 = np.concatenate(embeddings1, axis=0)
|
| 156 |
|
| 157 |
# Compute embeddings for Dataset 2
|
|
@@ -159,13 +152,9 @@ def perform_deduplication(
|
|
| 159 |
yield status, ""
|
| 160 |
embeddings2 = []
|
| 161 |
total_batches2 = (len(texts2) + batch_size - 1) // batch_size
|
| 162 |
-
for i, batch_texts in enumerate(batch_iterable(texts2, batch_size)):
|
| 163 |
batch_embeddings = model.encode(batch_texts, show_progressbar=False)
|
| 164 |
embeddings2.append(batch_embeddings)
|
| 165 |
-
# Update progress
|
| 166 |
-
progress((i + 1) / total_batches2, desc="Computing embeddings for Dataset 2")
|
| 167 |
-
# Yield control back to Gradio
|
| 168 |
-
yield status, ""
|
| 169 |
embedding_matrix2 = np.concatenate(embeddings2, axis=0)
|
| 170 |
|
| 171 |
# Deduplicate across datasets
|
|
|
|
| 68 |
embeddings = []
|
| 69 |
batch_size = 64
|
| 70 |
total_batches = (len(texts) + batch_size - 1) // batch_size
|
| 71 |
+
# Use progress.tqdm without yielding inside the loop
|
| 72 |
+
for batch_texts in progress.tqdm(batch_iterable(texts, batch_size), desc="Computing embeddings for Dataset 1", total=total_batches):
|
| 73 |
batch_embeddings = model.encode(batch_texts, show_progressbar=False)
|
| 74 |
embeddings.append(batch_embeddings)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
embedding_matrix = np.concatenate(embeddings, axis=0)
|
| 76 |
|
| 77 |
# Deduplicate
|
|
|
|
| 142 |
embeddings1 = []
|
| 143 |
batch_size = 64
|
| 144 |
total_batches1 = (len(texts1) + batch_size - 1) // batch_size
|
| 145 |
+
for batch_texts in progress.tqdm(batch_iterable(texts1, batch_size), desc="Computing embeddings for Dataset 1", total=total_batches1):
|
| 146 |
batch_embeddings = model.encode(batch_texts, show_progressbar=False)
|
| 147 |
embeddings1.append(batch_embeddings)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
embedding_matrix1 = np.concatenate(embeddings1, axis=0)
|
| 149 |
|
| 150 |
# Compute embeddings for Dataset 2
|
|
|
|
| 152 |
yield status, ""
|
| 153 |
embeddings2 = []
|
| 154 |
total_batches2 = (len(texts2) + batch_size - 1) // batch_size
|
| 155 |
+
for batch_texts in progress.tqdm(batch_iterable(texts2, batch_size), desc="Computing embeddings for Dataset 2", total=total_batches2):
|
| 156 |
batch_embeddings = model.encode(batch_texts, show_progressbar=False)
|
| 157 |
embeddings2.append(batch_embeddings)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
embedding_matrix2 = np.concatenate(embeddings2, axis=0)
|
| 159 |
|
| 160 |
# Deduplicate across datasets
|