File size: 1,325 Bytes
c7a6fe6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import subprocess
import time

# --- CONFIGURATION ---
# Values below are forwarded as command-line flags to the per-shard worker
# script launched by run_preprocessing().
LANG = "en"  # passed as --lang
TOTAL_SHARDS = 20  # number of shard runs; also passed as --num_shards
MAX_CHUNKS_PER_ARTICLE = 5  # passed as --max_chunks
# ---------------------

def run_preprocessing():
    """Run the worker script once per shard, sequentially, and report results.

    For each ``shard_id`` in ``range(TOTAL_SHARDS)`` the worker script is
    launched as a child process with ``--lang``, ``--shard_id``,
    ``--num_shards`` and ``--max_chunks`` flags, and this function waits for
    it to exit before starting the next shard.

    A shard whose process exits with a non-zero status is reported and
    recorded, and the remaining shards are still attempted. The final summary
    lists the failed shard IDs instead of claiming that every shard was
    processed.

    Returns:
        None. Progress and the final summary are printed to stdout.
    """
    # Imported locally so this function does not depend on a module-level
    # ``import sys`` being present.
    import sys

    # Launch the child with the interpreter running this script rather than
    # whatever "python" resolves to on PATH (which may be a different
    # environment or missing). Fall back to "python" if the interpreter path
    # cannot be determined.
    interpreter = sys.executable or "python"

    # Monotonic clock: elapsed time is unaffected by wall-clock adjustments.
    start_time = time.monotonic()
    failed_shards = []

    for shard_id in range(TOTAL_SHARDS):
        print(f"\n{'='*40}")
        print(f"STARTING SHARD {shard_id + 1} OF {TOTAL_SHARDS}")
        print(f"{'='*40}\n")

        # Build the command to call the existing worker script.
        command = [
            interpreter, "/home/mshahidul/readctrl/code/vectordb_build/t.py",
            "--lang", LANG,
            "--shard_id", str(shard_id),
            "--num_shards", str(TOTAL_SHARDS),
            "--max_chunks", str(MAX_CHUNKS_PER_ARTICLE),
        ]

        # Run the process and wait for it to finish before starting the next.
        try:
            subprocess.run(command, check=True)
        except subprocess.CalledProcessError as e:
            # Keep going with the remaining shards, but remember the failure
            # so the summary below is accurate.
            failed_shards.append(shard_id)
            print(
                f"\nError occurred while processing Shard {shard_id + 1} "
                f"of {TOTAL_SHARDS} (shard_id={shard_id}): {e}"
            )
        else:
            # Same 1-based numbering as the banner; the raw shard_id is shown
            # as well because that is the value passed to the worker.
            print(
                f"\nSuccessfully finished Shard {shard_id + 1} "
                f"of {TOTAL_SHARDS} (shard_id={shard_id})"
            )

    duration = (time.monotonic() - start_time) / 60
    if failed_shards:
        succeeded = TOTAL_SHARDS - len(failed_shards)
        print(
            f"\n{succeeded} of {TOTAL_SHARDS} shards processed in "
            f"{duration:.2f} minutes; {len(failed_shards)} failed "
            f"(shard_ids: {failed_shards})."
        )
    else:
        print(f"\nAll {TOTAL_SHARDS} shards processed in {duration:.2f} minutes.")

# Start the shard loop only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    run_preprocessing()