# ApplyCRs/scripts/build_cr_index.py
# commit a610111 (heymenn) — features: independent from docfinder, eol creds, input ts mode
#!/usr/bin/env python3
"""
build_cr_index.py β€” Build and push a CR index to a HuggingFace dataset.
Parses all Accepted CRs from the Excel, downloads each one (reusing cached
files), parses the cover page to get (spec_number, version), then pushes a
JSONL index to the HF dataset.
Usage:
python3 build_cr_index.py <excel_path> --output-dir DIR --hf-repo ORG/REPO
Environment variables:
EOL_USER, EOL_PASSWORD β€” ETSI EOL credentials for download_cr
HF_TOKEN β€” HuggingFace token (write access to hf-repo)
"""
import argparse
import datetime
import hashlib
import os
import sys
import time
from pathlib import Path
# ── sys.path setup ────────────────────────────────────────────────────────────
SCRIPT_DIR = Path(__file__).parent
sys.path.insert(0, str(SCRIPT_DIR))
from fetch_crs import parse_excel_all_accepted, download_cr, parse_cr_cover, wsl_path
from hf_cr_index import load_hf_index, push_hf_index
def _require_env():
    """Read ETSI EOL credentials and the HF token from the environment.

    Exits with an error message if any required variable is missing.
    Returns (eol_user, eol_password, hf_token).
    """
    eol_user = os.environ.get("EOL_USER", "")
    eol_password = os.environ.get("EOL_PASSWORD", "")
    hf_token = os.environ.get("HF_TOKEN", "")
    if not eol_user or not eol_password:
        sys.exit("ERROR: EOL_USER and EOL_PASSWORD must be set")
    if not hf_token:
        sys.exit("ERROR: HF_TOKEN must be set")
    return eol_user, eol_password, hf_token


def _download_with_retries(uid, cr_dir, eol_user, eol_password, attempts=3, delay=5):
    """Download one CR, retrying up to *attempts* times with *delay*-second pauses.

    Returns (docx_path, note); docx_path is None when every attempt failed,
    and note carries the last failure reason reported by download_cr.
    """
    docx_path, note = None, ""
    for attempt in range(1, attempts + 1):
        docx_path, note = download_cr(uid, cr_dir, eol_user, eol_password)
        if docx_path:
            break
        if attempt < attempts:
            print(f" [{uid}] attempt {attempt}/{attempts} failed ({note}) — retrying in {delay}s")
            time.sleep(delay)
    return docx_path, note


def main():
    """Build the CR index from the Excel contribution list and push it to HF.

    Workflow: parse CLI args, validate credentials, hash the Excel file,
    enumerate Accepted CRs, skip those already present in the remote index,
    download + parse the cover page of each new CR, then merge and push.
    Exits non-zero on unrecoverable errors (missing env/file, Excel parse
    failure, HF push failure).
    """
    ap = argparse.ArgumentParser(
        description="Build and push CR index to HuggingFace dataset.",
    )
    ap.add_argument("excel_path", help="Path to .xls or .xlsx contribution list")
    ap.add_argument(
        "--output-dir",
        default=str(Path.home() / "CR_Processing"),
        help="Base output directory (default: ~/CR_Processing)",
    )
    ap.add_argument(
        "--hf-repo",
        default="OrganizedProgrammers/CR_Index",
        help="HuggingFace dataset repo (default: OrganizedProgrammers/CR_Index)",
    )
    args = ap.parse_args()

    eol_user, eol_password, hf_token = _require_env()

    # wsl_path normalizes Windows-style paths when running under WSL.
    excel_path = Path(wsl_path(args.excel_path))
    if not excel_path.exists():
        sys.exit(f"ERROR: Excel file not found: {excel_path}")
    output_dir = Path(wsl_path(args.output_dir)).expanduser()
    cr_dir = output_dir / "CRs"
    cr_dir.mkdir(parents=True, exist_ok=True)

    # ── 1. Compute Excel hash ─────────────────────────────────────────────────
    # First 16 hex chars of SHA-256 identify this Excel revision in the index.
    excel_hash = hashlib.sha256(excel_path.read_bytes()).hexdigest()[:16]
    meeting_label = excel_path.stem
    print(f"Excel: {excel_path.name}")
    print(f"Excel hash: {excel_hash}")
    print(f"Meeting: {meeting_label}")
    print(f"HF repo: {args.hf_repo}")
    print()

    # ── 2. Parse all Accepted CRs ─────────────────────────────────────────────
    print("Parsing Excel for all Accepted CRs...")
    try:
        cr_list = parse_excel_all_accepted(str(excel_path))
    except Exception as e:
        sys.exit(f"ERROR parsing Excel: {e}")
    print(f"Found {len(cr_list)} Accepted CR(s)\n")
    if not cr_list:
        print("Nothing to index.")
        sys.exit(0)

    # ── 3. Load existing HF index ─────────────────────────────────────────────
    # Best-effort: a missing/unreadable remote index degrades to a full rebuild.
    print("Loading existing HF index...")
    try:
        existing = load_hf_index(hf_token, args.hf_repo)
    except Exception as e:
        print(f" WARNING: could not load existing index: {e}")
        existing = []
    # .get() tolerates malformed records in the remote index instead of
    # crashing the whole run on a KeyError.
    existing_keys = {(r.get("excel_hash"), r.get("uid")) for r in existing}
    print(f" {len(existing)} existing record(s), {len(existing_keys)} unique keys\n")

    # ── 4. Download and parse each new CR ─────────────────────────────────────
    new_records = []
    skipped = 0
    failed = []  # list of (uid, reason) for the summary report
    print("Processing CRs...")
    for uid, title, submitted_by in cr_list:
        if (excel_hash, uid) in existing_keys:
            print(f" [{uid}] already indexed — skipping")
            skipped += 1
            continue
        docx_path, note = _download_with_retries(uid, cr_dir, eol_user, eol_password)
        if not docx_path:
            print(f" [{uid}] FAILED — {note}")
            failed.append((uid, note))
            continue
        # A single corrupt .docx must not abort the whole indexing run.
        try:
            spec_number, version = parse_cr_cover(docx_path)
        except Exception as e:
            print(f" [{uid}] WARNING: cover page parse raised {e} — skipping")
            failed.append((uid, f"cover page parse error: {e}"))
            continue
        if not spec_number or not version:
            print(f" [{uid}] WARNING: could not parse cover page — skipping")
            failed.append((uid, "cover page parse failed"))
            continue
        print(f" [{uid}] -> TS {spec_number} v{version}")
        new_records.append({
            "excel_hash": excel_hash,
            "meeting_label": meeting_label,
            "uid": uid,
            "title": title,
            "submitted_by": submitted_by,
            "spec_number": spec_number,
            "version": version,
            # UTC so timestamps are comparable across machines pushing to
            # the same shared index (same format as before, now tz-stable).
            "parsed_at": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%S"),
        })
    print()
    print(f"Summary: {len(new_records)} new, {skipped} skipped, {len(failed)} failed")
    if failed:
        print("Failed CRs:")
        for uid, reason in failed:
            print(f" [{uid}] {reason}")
        print()

    # ── 5. Merge and push ─────────────────────────────────────────────────────
    if new_records:
        all_records = existing + new_records
        print(f"Pushing {len(all_records)} record(s) to {args.hf_repo}...")
        try:
            push_hf_index(all_records, hf_token, args.hf_repo)
            print(" Push successful")
        except Exception as e:
            sys.exit(f"ERROR pushing to HF: {e}")
    else:
        print("No new records to push.")

    # ── 6. Sentinel line (watched by app.py) ──────────────────────────────────
    print(f"INDEX_COMPLETE excel_hash={excel_hash}")
# Standard entry guard: run the indexer only when executed as a script,
# not when imported (e.g. by app.py or the test suite).
if __name__ == "__main__":
    main()