Spaces:
Sleeping
Sleeping
Commit ·
c56dc56
1
Parent(s): 8433b3c
fix
Browse files
app.py
CHANGED
|
@@ -35,7 +35,7 @@ def encode_batch_gpu(texts, model_name):
|
|
| 35 |
embeddings = model.encode(
|
| 36 |
texts,
|
| 37 |
batch_size=internal_batch_size,
|
| 38 |
-
show_progress_bar=False,
|
| 39 |
convert_to_numpy=True,
|
| 40 |
)
|
| 41 |
|
|
@@ -113,6 +113,26 @@ def process_dataset(model_name, progress=gr.Progress()):
|
|
| 113 |
|
| 114 |
processed_count += len(batch_indices)
|
| 115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
except Exception as e:
|
| 117 |
print(f"Error during GPU encoding batch {i}: {e}")
|
| 118 |
error_occurred = True
|
|
|
|
| 35 |
embeddings = model.encode(
|
| 36 |
texts,
|
| 37 |
batch_size=internal_batch_size,
|
| 38 |
+
show_progress_bar=True,
|
| 39 |
convert_to_numpy=True,
|
| 40 |
)
|
| 41 |
|
|
|
|
| 113 |
|
| 114 |
processed_count += len(batch_indices)
|
| 115 |
|
| 116 |
+
# --- Checkpoint Saving ---
|
| 117 |
+
print(
|
| 118 |
+
f"Batch completed. Saving checkpoint for {processed_count} processed rows..."
|
| 119 |
+
)
|
| 120 |
+
# Save locally first (fast)
|
| 121 |
+
df.to_parquet("embeddings_checkpoint.parquet")
|
| 122 |
+
|
| 123 |
+
# Push to Hub (slower but persistent across machines)
|
| 124 |
+
if hf_token and REPO_ID:
|
| 125 |
+
try:
|
| 126 |
+
# NOTE: ideally convert only when necessary; this step could be optimized
|
| 127 |
+
# Rebuilding the Dataset on every checkpoint may add memory pressure,
|
| 128 |
+
# but it ensures the copy on the Hub stays up to date
|
| 129 |
+
temp_ds = Dataset.from_pandas(df)
|
| 130 |
+
temp_ds.push_to_hub(REPO_ID, token=hf_token)
|
| 131 |
+
print("Checkpoint pushed to Hub.")
|
| 132 |
+
del temp_ds
|
| 133 |
+
except Exception as hub_err:
|
| 134 |
+
print(f"Warning: Failed to push checkpoint to Hub: {hub_err}")
|
| 135 |
+
|
| 136 |
except Exception as e:
|
| 137 |
print(f"Error during GPU encoding batch {i}: {e}")
|
| 138 |
error_occurred = True
|