shahidul034's picture
Add files using upload-large-folder tool
c7a6fe6 verified
import subprocess
import time
# --- CONFIGURATION ---
# Value forwarded verbatim to the worker script's --lang option.
LANG: str = "en"
# Number of worker invocations: run_preprocessing loops over
# range(TOTAL_SHARDS) and also forwards this value as --num_shards.
TOTAL_SHARDS: int = 20
# Forwarded as --max_chunks. Presumably a per-article chunk cap (inferred
# from the name only -- confirm against the worker script).
MAX_CHUNKS_PER_ARTICLE: int = 5
# ---------------------
def run_preprocessing(
    *,
    lang=None,
    total_shards=None,
    max_chunks=None,
    script_path="/home/mshahidul/readctrl/code/vectordb_build/t.py",
    stop_on_error=False,
):
    """Run the worker script once per shard, one shard at a time.

    Each shard is processed by a child Python process that is awaited before
    the next one starts. A shard whose process exits with a non-zero status
    is recorded as failed; by default the remaining shards still run.

    Args:
        lang: Value for the worker's ``--lang`` option. Defaults to the
            module-level ``LANG``.
        total_shards: Number of shards to run; also passed as
            ``--num_shards``. Defaults to the module-level ``TOTAL_SHARDS``.
        max_chunks: Value for the worker's ``--max_chunks`` option. Defaults
            to the module-level ``MAX_CHUNKS_PER_ARTICLE``.
        script_path: Path of the worker script to execute.
        stop_on_error: When true, stop after the first failing shard instead
            of continuing with the rest.

    Returns:
        A list with the 0-based ids of the shards whose process failed
        (empty when every attempted shard succeeded).
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import sys

    # Resolve defaults lazily so the module constants are only read when the
    # caller did not override them.
    lang = LANG if lang is None else lang
    total_shards = TOTAL_SHARDS if total_shards is None else total_shards
    max_chunks = MAX_CHUNKS_PER_ARTICLE if max_chunks is None else max_chunks

    # Launch the child with the interpreter running this script; a bare
    # "python" depends on PATH and may be absent or a different environment.
    interpreter = sys.executable or "python"

    failed_shards = []
    attempted = 0
    banner = "=" * 40
    # Monotonic clock: the elapsed time is unaffected by wall-clock changes.
    start_time = time.monotonic()

    for shard_id in range(total_shards):
        attempted += 1
        # The banner counts shards from 1 for readability; the id passed to
        # the worker (and shown in the result messages below) is 0-based.
        print(f"\n{banner}")
        print(f"STARTING SHARD {shard_id + 1} OF {total_shards}")
        # Flush so the banner is written before the child process produces
        # output when stdout is redirected to a file or pipe.
        print(f"{banner}\n", flush=True)

        command = [
            interpreter, script_path,
            "--lang", lang,
            "--shard_id", str(shard_id),
            "--num_shards", str(total_shards),
            "--max_chunks", str(max_chunks),
        ]

        # Block until this shard's process exits before starting the next.
        try:
            subprocess.run(command, check=True)
        except subprocess.CalledProcessError as e:
            print(f"\nError occurred while processing Shard {shard_id}: {e}")
            failed_shards.append(shard_id)
            if stop_on_error:
                break
        else:
            print(f"\nSuccessfully finished Shard {shard_id}")

    duration = (time.monotonic() - start_time) / 60
    if failed_shards:
        # Do not claim overall success when some shards failed.
        succeeded = attempted - len(failed_shards)
        print(
            f"\n{succeeded} of {total_shards} shards succeeded in "
            f"{duration:.2f} minutes; failed shard ids: {failed_shards}"
        )
    else:
        print(f"\nAll {total_shards} shards processed in {duration:.2f} minutes.")
    return failed_shards
# Script entry point: start the shard loop only when this file is executed
# directly, so importing the module has no side effects.
if __name__ == "__main__":
    run_preprocessing()