LordofMonarchs's picture
Upload folder using huggingface_hub
3751f09 verified
Raw
History Blame Contribute Delete
4.64 kB
#!/usr/bin/env python3
import os
import sys
import time
import subprocess
import argparse
# Absolute path of the directory that contains this file.
_SCRIPTS_DIR = os.path.dirname(os.path.abspath(__file__))
# Parent of that directory; used below as the base for the "precomputed" and
# "logs" directories and passed to the helper scripts as --base-dir.
_PROJECT_ROOT = os.path.dirname(_SCRIPTS_DIR)
def check_artifacts_up_to_date(precomputed_dir: str, candidates_path: str) -> bool:
    """Report whether every precomputed artifact exists and is not stale.

    An artifact counts as stale when its modification time is strictly older
    than that of ``candidates_path``. When ``candidates_path`` is not an
    existing regular file, only the presence of the artifacts is checked.

    Args:
        precomputed_dir: Directory expected to contain the artifact files.
        candidates_path: Path of the candidates JSONL file the artifacts are
            compared against.

    Returns:
        ``True`` if all required artifacts are present and none is older than
        the candidates file, ``False`` otherwise.
    """
    required_files = (
        "bm25_index.pkl",
        "candidate_ids.pkl",
        "lgbm_model.pkl",
        "static_features.pkl",
        "vocab.pkl",
        "bm25_matrix.npz",
        "candidate_offsets.pkl",
        "lgbm_model.txt",
    )

    # The candidates file is the same for every artifact, so stat it once
    # instead of on each loop iteration.
    if os.path.isfile(candidates_path):
        candidates_mtime = os.path.getmtime(candidates_path)
    else:
        candidates_mtime = None

    for name in required_files:
        artifact_path = os.path.join(precomputed_dir, name)
        if not os.path.isfile(artifact_path):
            return False
        if (
            candidates_mtime is not None
            and os.path.getmtime(artifact_path) < candidates_mtime
        ):
            return False
    return True
def run_step(command_list, step_label, step_num, total_steps=3):
    """Run one pipeline step as a subprocess and report its outcome.

    The child's stdout and stderr are captured. On success the stripped
    stdout is printed followed by a completion line; on failure both streams
    are printed and the current process exits.

    Args:
        command_list: Argument list handed to ``subprocess.run``.
        step_label: Name of the step shown in progress messages.
        step_num: Position of this step, shown as ``[step_num/total_steps]``.
        total_steps: Total number of steps shown in progress messages.
            Defaults to 3.

    Returns:
        Seconds the subprocess took, as a float.

    Raises:
        SystemExit: With the child's return code when that code is non-zero.
    """
    progress = f"{step_num}/{total_steps}"
    print(f"\n[{progress}] Running {step_label}...")

    # perf_counter is monotonic, so the measured duration is not affected by
    # system clock adjustments while the child is running.
    started = time.perf_counter()
    result = subprocess.run(command_list, capture_output=True, text=True)
    elapsed = time.perf_counter() - started

    if result.returncode != 0:
        print(f"\n[ERROR] Step {progress} ({step_label}) FAILED (Exit Code: {result.returncode})")
        print("--- STDOUT ---")
        print(result.stdout)
        print("--- STDERR ---")
        print(result.stderr)
        sys.exit(result.returncode)

    print(result.stdout.strip())
    print(f"[{progress}] {step_label.capitalize()} complete ({elapsed:.2f}s)")
    return elapsed
def main():
    """Run the precompute, rank and validate steps, then print a summary.

    Command-line options:
        --candidates: Path to the candidates JSONL file.
        --out: Path of the CSV the ranking step writes.
        --force-precompute: Rebuild the precomputed artifacts even when they
            are up to date.

    The precompute step is skipped when ``check_artifacts_up_to_date`` reports
    the artifacts as current and ``--force-precompute`` is not given. After
    the steps finish, the output CSV is read back and must contain exactly
    100 rows.

    Raises:
        SystemExit: With status 1 when the output CSV is missing, cannot be
            read, or does not have exactly 100 rows. ``run_step`` also exits
            the process when a step fails.
    """
    parser = argparse.ArgumentParser(description="Redrob Ranking Pipeline Runner")
    parser.add_argument("--candidates", default="./candidates.jsonl", help="Path to candidates JSONL")
    parser.add_argument("--out", default="./CTRL_COFFEE_REPEAT.csv", help="Path to output CSV")
    parser.add_argument("--force-precompute", action="store_true", help="Force rebuild precompute artifacts")
    args = parser.parse_args()

    candidates_path = os.path.abspath(args.candidates)
    out_path = os.path.abspath(args.out)
    # Messages name the file actually requested via --out rather than a
    # fixed "submission.csv".
    out_name = os.path.basename(out_path)
    precomputed_dir = os.path.join(_PROJECT_ROOT, "precomputed")
    python_exe = sys.executable
    expected_rows = 100

    # Resolve helper scripts against the project root so the runner works
    # regardless of the current working directory.
    precompute_script = os.path.join(_PROJECT_ROOT, "scripts", "precompute.py")
    rank_script = os.path.join(_PROJECT_ROOT, "src", "rank.py")
    validate_script = os.path.join(_PROJECT_ROOT, "scripts", "validate_submission.py")

    t_start = time.perf_counter()
    artifacts_ready = check_artifacts_up_to_date(precomputed_dir, candidates_path)

    # Step 1: precompute (only when artifacts are missing/stale or forced).
    ran_precompute = args.force_precompute or not artifacts_ready
    t_precompute = 0.0
    if ran_precompute:
        cmd = [python_exe, precompute_script, "--candidates", candidates_path, "--base-dir", _PROJECT_ROOT]
        t_precompute = run_step(cmd, "precompute", 1)
    else:
        print("\n[1/3] Precompute skipped (artifacts up to date)")

    # Step 2: rank.
    cmd = [python_exe, rank_script, "--candidates", candidates_path, "--out", out_path, "--base-dir", _PROJECT_ROOT]
    t_rank = run_step(cmd, "rank", 2)

    # Step 3: validate.
    cmd = [python_exe, validate_script, "--submission", out_path]
    t_validate = run_step(cmd, "validate_submission", 3)

    total_wall = time.perf_counter() - t_start

    print("\n" + "=" * 60)
    print("PIPELINE EXECUTION SUMMARY")
    print("=" * 60)
    print(f" Total Clock Time: {total_wall:.2f} seconds")
    # Report "skipped" based on whether the step ran, not on its duration.
    if ran_precompute:
        print(f" Step 1 (Precompute): {t_precompute:.2f}s")
    else:
        print(" Step 1 (Precompute): Skipped (up to date)")
    print(f" Step 2 (Ranking): {t_rank:.2f}s")
    print(f" Step 3 (Validation): {t_validate:.2f}s")

    if not os.path.isfile(out_path):
        print(f" [ERROR] Missing output file {out_name}!")
        sys.exit(1)

    try:
        import pandas as pd

        row_count = len(pd.read_csv(out_path))
    except Exception as e:
        # Top-level boundary: any import or parse failure is reported and
        # turned into a non-zero exit status.
        print(f" [ERROR] Error reading CSV: {e}")
        sys.exit(1)

    if row_count != expected_rows:
        print(f" [ERROR] {out_name} has {row_count} rows, expected exactly {expected_rows}!")
        sys.exit(1)
    print(f" CONFIRMED: {out_name} exists with exactly {expected_rows} rows.")

    # Point at the most recently modified "rank_*" entry in logs/, if any.
    log_dir = os.path.join(_PROJECT_ROOT, "logs")
    if os.path.isdir(log_dir):
        logs = [os.path.join(log_dir, f) for f in os.listdir(log_dir) if f.startswith("rank_")]
        if logs:
            latest_log = max(logs, key=os.path.getmtime)
            print(f" Latest Log File: {latest_log}")
    print("=" * 60)
# Run the pipeline only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()