stevenbucaille committed on
Commit
c56dc56
·
1 Parent(s): 8433b3c
Files changed (1) hide show
  1. app.py +21 -1
app.py CHANGED
@@ -35,7 +35,7 @@ def encode_batch_gpu(texts, model_name):
35
  embeddings = model.encode(
36
  texts,
37
  batch_size=internal_batch_size,
38
- show_progress_bar=False,
39
  convert_to_numpy=True,
40
  )
41
 
@@ -113,6 +113,26 @@ def process_dataset(model_name, progress=gr.Progress()):
113
 
114
  processed_count += len(batch_indices)
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  except Exception as e:
117
  print(f"Error during GPU encoding batch {i}: {e}")
118
  error_occurred = True
 
35
  embeddings = model.encode(
36
  texts,
37
  batch_size=internal_batch_size,
38
+ show_progress_bar=True,
39
  convert_to_numpy=True,
40
  )
41
 
 
113
 
114
  processed_count += len(batch_indices)
115
 
116
+ # --- Checkpoint Saving ---
117
+ print(
118
+ f"Batch completed. Saving checkpoint for {processed_count} processed rows..."
119
+ )
120
+ # Save locally first (fast)
121
+ df.to_parquet("embeddings_checkpoint.parquet")
122
+
123
+ # Push to Hub (slower but persistent across machines)
124
+ if hf_token and REPO_ID:
125
+ try:
126
+ # Convert only if necessary or optimize
127
+ # Creating a new dataset every time might apply memory pressure
128
+ # but it is what ensures the Hub is up to date
129
+ temp_ds = Dataset.from_pandas(df)
130
+ temp_ds.push_to_hub(REPO_ID, token=hf_token)
131
+ print("Checkpoint pushed to Hub.")
132
+ del temp_ds
133
+ except Exception as hub_err:
134
+ print(f"Warning: Failed to push checkpoint to Hub: {hub_err}")
135
+
136
  except Exception as e:
137
  print(f"Error during GPU encoding batch {i}: {e}")
138
  error_occurred = True