"""Run the per-shard preprocessing script sequentially, one shard at a time."""

import subprocess
import sys
import time

# --- CONFIGURATION ---
LANG = "en"
TOTAL_SHARDS = 20
MAX_CHUNKS_PER_ARTICLE = 5
SCRIPT_PATH = "/home/mshahidul/readctrl/code/vectordb_build/t.py"
# ---------------------


def run_preprocessing(
    *,
    lang=LANG,
    total_shards=TOTAL_SHARDS,
    max_chunks=MAX_CHUNKS_PER_ARTICLE,
    script_path=SCRIPT_PATH,
    stop_on_error=False,
):
    """Invoke the shard script once per shard and report the outcome.

    Each shard is processed in its own child process; the next shard is only
    started after the previous one has exited.

    Args:
        lang: Value passed to the script's ``--lang`` option.
        total_shards: Number of shards; shard ids ``0 .. total_shards - 1``
            are processed in order and the value is passed as ``--num_shards``.
        max_chunks: Value passed to the script's ``--max_chunks`` option.
        script_path: Path of the Python script to run for every shard.
        stop_on_error: If true, stop after the first shard that fails instead
            of continuing with the remaining shards.

    Returns:
        The list of 0-based shard ids whose child process failed (empty when
        every attempted shard succeeded).
    """
    started = time.monotonic()
    failed = []
    attempted = 0

    for shard_id in range(total_shards):
        attempted += 1
        banner = "=" * 40
        print(f"\n{banner}")
        print(
            f"STARTING SHARD {shard_id + 1} OF {total_shards} "
            f"(shard_id={shard_id})"
        )
        print(f"{banner}\n")

        # sys.executable guarantees the child runs under the same interpreter
        # (and therefore the same environment) as this driver, instead of
        # whatever "python" happens to resolve to on PATH.
        command = [
            sys.executable,
            script_path,
            "--lang", lang,
            "--shard_id", str(shard_id),
            "--num_shards", str(total_shards),
            "--max_chunks", str(max_chunks),
        ]

        try:
            # check=True turns a non-zero exit status into CalledProcessError.
            subprocess.run(command, check=True)
        except (subprocess.CalledProcessError, OSError) as exc:
            # OSError covers the case where the child could not be launched.
            print(f"\nError occurred while processing shard_id {shard_id}: {exc}")
            failed.append(shard_id)
            if stop_on_error:
                break
        else:
            print(f"\nSuccessfully finished shard_id {shard_id}")

    minutes = (time.monotonic() - started) / 60
    succeeded = attempted - len(failed)
    print(
        f"\n{succeeded} of {total_shards} shards succeeded "
        f"({attempted} attempted) in {minutes:.2f} minutes."
    )
    if failed:
        print(f"Failed shard ids: {failed}")
    return failed


if __name__ == "__main__":
    # Propagate failure to the caller (shell, scheduler, CI) via the exit code.
    sys.exit(1 if run_preprocessing() else 0)