#!/usr/bin/env python3
"""
build_cr_index.py — Build and push a CR index to a HuggingFace dataset.

Parses all Accepted CRs from the Excel, downloads each one (reusing cached
files), parses the cover page to get (spec_number, version), then pushes a
JSONL index to the HF dataset.

Usage:
    python3 build_cr_index.py --output-dir DIR --hf-repo ORG/REPO

Environment variables:
    EOL_USER, EOL_PASSWORD — ETSI EOL credentials for download_cr
    HF_TOKEN               — HuggingFace token (write access to hf-repo)
"""
import argparse
import datetime
import hashlib
import os
import sys
import time
from pathlib import Path

# ── sys.path setup ────────────────────────────────────────────────────────────
SCRIPT_DIR = Path(__file__).parent
sys.path.insert(0, str(SCRIPT_DIR))

from fetch_crs import parse_excel_all_accepted, download_cr, parse_cr_cover, wsl_path
from hf_cr_index import load_hf_index, push_hf_index

# Download retry policy for a single CR.
DOWNLOAD_ATTEMPTS = 3
RETRY_DELAY_S = 5


def _parse_args():
    """Build the argument parser and return the parsed CLI arguments."""
    ap = argparse.ArgumentParser(
        description="Build and push CR index to HuggingFace dataset.",
    )
    ap.add_argument("excel_path", help="Path to .xls or .xlsx contribution list")
    ap.add_argument(
        "--output-dir",
        default=str(Path.home() / "CR_Processing"),
        help="Base output directory (default: ~/CR_Processing)",
    )
    ap.add_argument(
        "--hf-repo",
        default="OrganizedProgrammers/CR_Index",
        help="HuggingFace dataset repo (default: OrganizedProgrammers/CR_Index)",
    )
    return ap.parse_args()


def _require_env():
    """Read required credentials from the environment.

    Returns (eol_user, eol_password, hf_token); exits with an error message
    if any of them is missing.
    """
    eol_user = os.environ.get("EOL_USER", "")
    eol_password = os.environ.get("EOL_PASSWORD", "")
    hf_token = os.environ.get("HF_TOKEN", "")
    if not eol_user or not eol_password:
        sys.exit("ERROR: EOL_USER and EOL_PASSWORD must be set")
    if not hf_token:
        sys.exit("ERROR: HF_TOKEN must be set")
    return eol_user, eol_password, hf_token


def _download_with_retry(uid, cr_dir, eol_user, eol_password):
    """Download one CR, retrying up to DOWNLOAD_ATTEMPTS times.

    Returns (docx_path, note): docx_path is None on failure and note then
    carries the last error. Exceptions from download_cr (e.g. transient
    network errors) are converted into a failed attempt instead of
    aborting the whole run.
    """
    docx_path, note = None, ""
    for attempt in range(1, DOWNLOAD_ATTEMPTS + 1):
        try:
            docx_path, note = download_cr(uid, cr_dir, eol_user, eol_password)
        except Exception as e:
            # Treat an exception like a failed attempt so the retry loop runs.
            docx_path, note = None, str(e)
        if docx_path:
            break
        if attempt < DOWNLOAD_ATTEMPTS:
            print(f" [{uid}] attempt {attempt}/3 failed ({note}) — retrying in 5s")
            time.sleep(RETRY_DELAY_S)
    return docx_path, note


def _build_new_records(cr_list, existing_keys, cr_dir, eol_user, eol_password,
                       excel_hash, meeting_label):
    """Download and parse each not-yet-indexed CR.

    Returns (new_records, skipped, failed): new_records is a list of index
    dicts, skipped counts CRs already present in the HF index, failed is a
    list of (uid, reason) tuples.
    """
    new_records = []
    skipped = 0
    failed = []
    print("Processing CRs...")
    for uid, title, submitted_by in cr_list:
        if (excel_hash, uid) in existing_keys:
            print(f" [{uid}] already indexed — skipping")
            skipped += 1
            continue

        docx_path, note = _download_with_retry(uid, cr_dir, eol_user, eol_password)
        if not docx_path:
            print(f" [{uid}] FAILED — {note}")
            failed.append((uid, note))
            continue

        # A corrupt .docx must not abort the run — fold parse errors into the
        # existing "cover page parse failed" soft-failure path.
        try:
            spec_number, version = parse_cr_cover(docx_path)
        except Exception:
            spec_number, version = None, None
        if not spec_number or not version:
            print(f" [{uid}] WARNING: could not parse cover page — skipping")
            failed.append((uid, "cover page parse failed"))
            continue

        print(f" [{uid}] -> TS {spec_number} v{version}")
        new_records.append({
            "excel_hash": excel_hash,
            "meeting_label": meeting_label,
            "uid": uid,
            "title": title,
            "submitted_by": submitted_by,
            "spec_number": spec_number,
            "version": version,
            "parsed_at": datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
        })
    return new_records, skipped, failed


def main():
    """Entry point: parse args, build the CR index, and push it to HF."""
    args = _parse_args()
    eol_user, eol_password, hf_token = _require_env()

    excel_path = Path(wsl_path(args.excel_path))
    if not excel_path.exists():
        sys.exit(f"ERROR: Excel file not found: {excel_path}")

    output_dir = Path(wsl_path(args.output_dir)).expanduser()
    cr_dir = output_dir / "CRs"
    cr_dir.mkdir(parents=True, exist_ok=True)

    # ── 1. Compute Excel hash ─────────────────────────────────────────────────
    # The 16-hex-char hash identifies this Excel revision in the index.
    excel_hash = hashlib.sha256(excel_path.read_bytes()).hexdigest()[:16]
    meeting_label = excel_path.stem
    print(f"Excel: {excel_path.name}")
    print(f"Excel hash: {excel_hash}")
    print(f"Meeting: {meeting_label}")
    print(f"HF repo: {args.hf_repo}")
    print()

    # ── 2. Parse all Accepted CRs ─────────────────────────────────────────────
    print("Parsing Excel for all Accepted CRs...")
    try:
        cr_list = parse_excel_all_accepted(str(excel_path))
    except Exception as e:
        sys.exit(f"ERROR parsing Excel: {e}")
    print(f"Found {len(cr_list)} Accepted CR(s)\n")
    if not cr_list:
        print("Nothing to index.")
        sys.exit(0)

    # ── 3. Load existing HF index ─────────────────────────────────────────────
    print("Loading existing HF index...")
    try:
        existing = load_hf_index(hf_token, args.hf_repo)
    except Exception as e:
        print(f" WARNING: could not load existing index: {e}")
        existing = []
    # .get() tolerates malformed remote records instead of raising KeyError.
    existing_keys = {(r.get("excel_hash"), r.get("uid")) for r in existing}
    print(f" {len(existing)} existing record(s), {len(existing_keys)} unique keys\n")

    # ── 4. Download and parse each new CR ─────────────────────────────────────
    new_records, skipped, failed = _build_new_records(
        cr_list, existing_keys, cr_dir, eol_user, eol_password,
        excel_hash, meeting_label,
    )

    print()
    print(f"Summary: {len(new_records)} new, {skipped} skipped, {len(failed)} failed")
    if failed:
        print("Failed CRs:")
        for uid, reason in failed:
            print(f" [{uid}] {reason}")
    print()

    # ── 5. Merge and push ─────────────────────────────────────────────────────
    if new_records:
        all_records = existing + new_records
        print(f"Pushing {len(all_records)} record(s) to {args.hf_repo}...")
        try:
            push_hf_index(all_records, hf_token, args.hf_repo)
            print(" Push successful")
        except Exception as e:
            sys.exit(f"ERROR pushing to HF: {e}")
    else:
        print("No new records to push.")

    # ── 6. Sentinel line (watched by app.py) ──────────────────────────────────
    print(f"INDEX_COMPLETE excel_hash={excel_hash}")


if __name__ == "__main__":
    main()