Spaces:

stevenbucaille
/

semantic-code-repo

Sleeping

stevenbucaille committed on Feb 5

Commit

c56dc56

1 Parent(s): 8433b3c

fix

Files changed (1) hide show

app.py CHANGED Viewed

@@ -35,7 +35,7 @@ def encode_batch_gpu(texts, model_name):
     embeddings = model.encode(
         texts,
         batch_size=internal_batch_size,
-        show_progress_bar=False,
         convert_to_numpy=True,
     )
@@ -113,6 +113,26 @@ def process_dataset(model_name, progress=gr.Progress()):
                 processed_count += len(batch_indices)
             except Exception as e:
                 print(f"Error during GPU encoding batch {i}: {e}")
                 error_occurred = True

     embeddings = model.encode(
         texts,
         batch_size=internal_batch_size,
+        show_progress_bar=True,
         convert_to_numpy=True,
     )
                 processed_count += len(batch_indices)
+                # --- Checkpoint Saving ---
+                print(
+                    f"Batch completed. Saving checkpoint for {processed_count} processed rows..."
+                )
+                # Save locally first (fast)
+                df.to_parquet("embeddings_checkpoint.parquet")
+                # Push to Hub (slower but persistent across machines)
+                if hf_token and REPO_ID:
+                    try:
+                        # Convert only if necessary or optimize
+                        # Creating a new dataset every time might apply memory pressure
+                        # but it is what ensures the Hub is up to date
+                        temp_ds = Dataset.from_pandas(df)
+                        temp_ds.push_to_hub(REPO_ID, token=hf_token)
+                        print("Checkpoint pushed to Hub.")
+                        del temp_ds
+                    except Exception as hub_err:
+                        print(f"Warning: Failed to push checkpoint to Hub: {hub_err}")
             except Exception as e:
                 print(f"Error during GPU encoding batch {i}: {e}")
                 error_occurred = True