Updates
Browse files
app.py
CHANGED
|
@@ -4,8 +4,6 @@ import numpy as np
|
|
| 4 |
from model2vec import StaticModel
|
| 5 |
from reach import Reach
|
| 6 |
from difflib import ndiff
|
| 7 |
-
import tqdm
|
| 8 |
-
from contextlib import contextmanager
|
| 9 |
|
| 10 |
# Load the model at startup
|
| 11 |
model = StaticModel.from_pretrained("minishlab/M2V_base_output")
|
|
@@ -27,19 +25,14 @@ def batch_iterable(iterable, batch_size):
|
|
| 27 |
for i in range(0, len(iterable), batch_size):
|
| 28 |
yield iterable[i:i + batch_size]
|
| 29 |
|
| 30 |
-
@contextmanager
|
| 31 |
-
def tqdm_redirect(progress):
|
| 32 |
-
original_tqdm = tqdm.tqdm
|
| 33 |
-
try:
|
| 34 |
-
tqdm.tqdm = progress.tqdm
|
| 35 |
-
yield
|
| 36 |
-
finally:
|
| 37 |
-
tqdm.tqdm = original_tqdm
|
| 38 |
-
|
| 39 |
def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
def deduplicate(
|
| 45 |
embedding_matrix: np.ndarray,
|
|
@@ -299,7 +292,8 @@ def deduplicate_across_datasets(
|
|
| 299 |
|
| 300 |
return duplicate_indices_in_test, duplicate_to_original_mapping
|
| 301 |
|
| 302 |
-
|
|
|
|
| 303 |
gr.Markdown("# Semantic Deduplication")
|
| 304 |
|
| 305 |
deduplication_type = gr.Radio(
|
|
@@ -327,8 +321,8 @@ with gr.Blocks() as demo:
|
|
| 327 |
|
| 328 |
compute_button = gr.Button("Compute")
|
| 329 |
|
| 330 |
-
# Use '
|
| 331 |
-
status_output = gr.
|
| 332 |
result_output = gr.Markdown()
|
| 333 |
|
| 334 |
# Function to update the visibility of dataset2_inputs
|
|
|
|
| 4 |
from model2vec import StaticModel
|
| 5 |
from reach import Reach
|
| 6 |
from difflib import ndiff
|
|
|
|
|
|
|
| 7 |
|
| 8 |
# Load the model at startup
|
| 9 |
model = StaticModel.from_pretrained("minishlab/M2V_base_output")
|
|
|
|
| 25 |
for i in range(0, len(iterable), batch_size):
|
| 26 |
yield iterable[i:i + batch_size]
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
def compute_embeddings(texts, batch_size, progress, desc="Computing embeddings"):
    """Encode *texts* in batches and return the stacked embeddings.

    Each batch produced by ``batch_iterable(texts, batch_size)`` is passed to
    the module-level ``model.encode`` with its own progress bar disabled;
    progress is reported through *progress* instead.

    Args:
        texts: Sized, sliceable collection of inputs to encode.
        batch_size: Number of items per batch; must be a positive integer.
        progress: Callable invoked after every batch as
            ``progress(fraction_done, desc=desc)``.
            NOTE(review): presumably a Gradio progress tracker -- confirm
            against the caller.
        desc: Label forwarded to *progress* on every update.

    Returns:
        The per-batch results of ``model.encode`` concatenated along axis 0.

    Raises:
        ValueError: If *batch_size* is not positive or *texts* is empty.
    """
    # Fail early with an explicit message. Previously these cases surfaced as
    # an unrelated ValueError from range() or np.concatenate().
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if len(texts) == 0:
        raise ValueError("texts must contain at least one item to embed")

    # Ceiling division: a trailing partial batch still counts as one batch.
    total_batches = (len(texts) + batch_size - 1) // batch_size

    embeddings = []
    batches = batch_iterable(texts, batch_size)
    for batches_done, batch_texts in enumerate(batches, start=1):
        # The model's own progress bar is disabled; reporting goes through
        # the supplied callback below.
        embeddings.append(model.encode(batch_texts, show_progressbar=False))
        progress(batches_done / total_batches, desc=desc)

    return np.concatenate(embeddings, axis=0)
|
| 36 |
|
| 37 |
def deduplicate(
|
| 38 |
embedding_matrix: np.ndarray,
|
|
|
|
| 292 |
|
| 293 |
return duplicate_indices_in_test, duplicate_to_original_mapping
|
| 294 |
|
| 295 |
+
# Adjust the height of the status_output component using custom CSS
|
| 296 |
+
with gr.Blocks(css="#status_output { height: 150px; overflow: auto; }") as demo:
|
| 297 |
gr.Markdown("# Semantic Deduplication")
|
| 298 |
|
| 299 |
deduplication_type = gr.Radio(
|
|
|
|
| 321 |
|
| 322 |
compute_button = gr.Button("Compute")
|
| 323 |
|
| 324 |
+
# Use 'gr.Markdown' with 'elem_id' and custom CSS to adjust height
|
| 325 |
+
status_output = gr.Markdown(elem_id="status_output")
|
| 326 |
result_output = gr.Markdown()
|
| 327 |
|
| 328 |
# Function to update the visibility of dataset2_inputs
|