import subprocess
import sys
import time
# --- CONFIGURATION ---
# Value forwarded to the preprocessing script's --lang option.
LANG = "en"
# Number of shards: forwarded as --num_shards, and shard IDs run from 0 to
# TOTAL_SHARDS - 1.
TOTAL_SHARDS = 20
# Value forwarded to the preprocessing script's --max_chunks option.
MAX_CHUNKS_PER_ARTICLE = 5
# ---------------------
def run_preprocessing():
    """Run the preprocessing script once per shard, one shard at a time.

    For every shard ID in ``range(TOTAL_SHARDS)`` the script ``t.py`` is
    launched in a child process, and this function waits for it to exit
    before starting the next shard.  A shard whose process exits with a
    non-zero status is reported and recorded; the remaining shards still
    run.  A final summary prints the elapsed time in minutes and, if any
    shard failed, the IDs of the failed shards.
    """
    banner = "=" * 40
    failed_shard_ids = []
    # Monotonic clock: the measured duration cannot be skewed by wall-clock
    # adjustments during a long run.
    start_time = time.monotonic()

    for shard_id in range(TOTAL_SHARDS):
        # shard_id is the 0-based value handed to the script; shard_number
        # is the 1-based counter used consistently in progress messages.
        shard_number = shard_id + 1
        print(f"\n{banner}")
        print(f"STARTING SHARD {shard_number} OF {TOTAL_SHARDS}")
        print(f"{banner}\n")

        # Use the interpreter running this script rather than whatever
        # "python" happens to resolve to on PATH.
        command = [
            sys.executable, "/home/mshahidul/readctrl/code/vectordb_build/t.py",
            "--lang", LANG,
            "--shard_id", str(shard_id),
            "--num_shards", str(TOTAL_SHARDS),
            "--max_chunks", str(MAX_CHUNKS_PER_ARTICLE),
        ]

        # Blocks until the child exits; check=True turns a non-zero exit
        # status into CalledProcessError.
        try:
            subprocess.run(command, check=True)
        except subprocess.CalledProcessError as e:
            failed_shard_ids.append(shard_id)
            print(
                f"\nError occurred while processing Shard {shard_number} "
                f"(shard_id={shard_id}): {e}"
            )
            # Keep going so one bad shard does not block the rest; add a
            # `break` here to stop on the first error instead.
        else:
            print(
                f"\nSuccessfully finished Shard {shard_number} "
                f"(shard_id={shard_id})"
            )

    duration = (time.monotonic() - start_time) / 60
    if failed_shard_ids:
        succeeded = TOTAL_SHARDS - len(failed_shard_ids)
        print(
            f"\n{succeeded} of {TOTAL_SHARDS} shards processed successfully "
            f"in {duration:.2f} minutes; failed shard IDs: {failed_shard_ids}."
        )
    else:
        print(f"\nAll {TOTAL_SHARDS} shards processed in {duration:.2f} minutes.")
if __name__ == "__main__":
    # Process all shards only when executed as a script, not when imported.
    run_preprocessing()