Spaces:

OrganizedProgrammers
/

ApplyCRs

Sleeping

File size: 6,540 Bytes

a610111

#!/usr/bin/env python3
"""
build_cr_index.py — Build and push a CR index to a HuggingFace dataset.

Parses all Accepted CRs from the Excel, downloads each one (reusing cached
files), parses the cover page to get (spec_number, version), then pushes a
JSONL index to the HF dataset.

Usage:
    python3 build_cr_index.py <excel_path> [--output-dir DIR] [--hf-repo ORG/REPO]

Environment variables:
    EOL_USER, EOL_PASSWORD  — ETSI EOL credentials for download_cr
    HF_TOKEN                — HuggingFace token (write access to hf-repo)
"""

import argparse
import datetime
import hashlib
import os
import sys
import time
from pathlib import Path

# ── sys.path setup ────────────────────────────────────────────────────────────
# Put the directory containing this script at the front of sys.path so the
# imports that follow (fetch_crs, hf_cr_index) are searched for there first.
SCRIPT_DIR = Path(__file__).parent
sys.path.insert(0, str(SCRIPT_DIR))

from fetch_crs import parse_excel_all_accepted, download_cr, parse_cr_cover, wsl_path
from hf_cr_index import load_hf_index, push_hf_index


def _download_with_retry(uid, cr_dir, eol_user, eol_password, attempts=3, delay=5):
    """Download one CR via ``download_cr``, retrying when it reports failure.

    Args:
        uid: CR identifier passed through to ``download_cr``.
        cr_dir: Directory passed through to ``download_cr``.
        eol_user: EOL user name passed through to ``download_cr``.
        eol_password: EOL password passed through to ``download_cr``.
        attempts: Maximum number of ``download_cr`` calls.
        delay: Seconds to sleep between two consecutive attempts.

    Returns:
        The ``(docx_path, note)`` pair from the last ``download_cr`` call.
        ``docx_path`` is falsy when every attempt failed.
    """
    docx_path, note = None, ""
    for attempt in range(1, attempts + 1):
        docx_path, note = download_cr(uid, cr_dir, eol_user, eol_password)
        if docx_path:
            break
        # No sleep after the final attempt: the caller reports the failure.
        if attempt < attempts:
            print(f"  [{uid}] attempt {attempt}/{attempts} failed ({note}) — retrying in {delay}s")
            time.sleep(delay)
    return docx_path, note


def main():
    """Build the CR index for one Excel contribution list and push it to HF.

    Workflow:
      1. Read the CLI arguments and the ``EOL_USER`` / ``EOL_PASSWORD`` /
         ``HF_TOKEN`` environment variables; exit with an error message when
         a credential is missing or the Excel file does not exist.
      2. Fingerprint the Excel file (first 16 hex chars of its SHA-256) and
         use the file stem as the meeting label.
      3. Parse the accepted CRs from the Excel and load the existing index.
      4. For every CR whose ``(excel_hash, uid)`` key is not yet known,
         download it (with retries), parse its cover page, and build a record.
      5. Push ``existing + new`` records when at least one new record exists.
      6. Print the ``INDEX_COMPLETE`` sentinel line.

    Exits via ``sys.exit`` on missing credentials, missing Excel file, Excel
    parse errors, an empty CR list (status 0), or a failed push.
    """
    ap = argparse.ArgumentParser(
        description="Build and push CR index to HuggingFace dataset.",
    )
    ap.add_argument("excel_path", help="Path to .xls or .xlsx contribution list")
    ap.add_argument(
        "--output-dir",
        default=str(Path.home() / "CR_Processing"),
        help="Base output directory (default: ~/CR_Processing)",
    )
    ap.add_argument(
        "--hf-repo",
        default="OrganizedProgrammers/CR_Index",
        help="HuggingFace dataset repo (default: OrganizedProgrammers/CR_Index)",
    )
    args = ap.parse_args()

    eol_user = os.environ.get("EOL_USER", "")
    eol_password = os.environ.get("EOL_PASSWORD", "")
    hf_token = os.environ.get("HF_TOKEN", "")

    if not eol_user or not eol_password:
        sys.exit("ERROR: EOL_USER and EOL_PASSWORD must be set")
    if not hf_token:
        sys.exit("ERROR: HF_TOKEN must be set")

    excel_path = Path(wsl_path(args.excel_path))
    if not excel_path.exists():
        sys.exit(f"ERROR: Excel file not found: {excel_path}")

    output_dir = Path(wsl_path(args.output_dir)).expanduser()
    cr_dir = output_dir / "CRs"
    cr_dir.mkdir(parents=True, exist_ok=True)

    # ── 1. Compute Excel hash ─────────────────────────────────────────────────
    # The truncated content hash, together with the CR uid, forms the index key.
    excel_hash = hashlib.sha256(excel_path.read_bytes()).hexdigest()[:16]
    meeting_label = excel_path.stem
    print(f"Excel:        {excel_path.name}")
    print(f"Excel hash:   {excel_hash}")
    print(f"Meeting:      {meeting_label}")
    print(f"HF repo:      {args.hf_repo}")
    print()

    # ── 2. Parse all Accepted CRs ─────────────────────────────────────────────
    print("Parsing Excel for all Accepted CRs...")
    try:
        cr_list = parse_excel_all_accepted(str(excel_path))
    except Exception as e:
        sys.exit(f"ERROR parsing Excel: {e}")
    print(f"Found {len(cr_list)} Accepted CR(s)\n")

    if not cr_list:
        print("Nothing to index.")
        sys.exit(0)

    # ── 3. Load existing HF index ─────────────────────────────────────────────
    print("Loading existing HF index...")
    try:
        existing = load_hf_index(hf_token, args.hf_repo)
    except Exception as e:
        # NOTE(review): falling back to an empty list means the push in step 5
        # sends only the new records; if push_hf_index replaces the remote
        # file, a transient load error could drop previously indexed records.
        # Confirm against hf_cr_index before changing this best-effort path.
        print(f"  WARNING: could not load existing index: {e}")
        existing = []
    # .get() keeps one malformed record (missing a key) from aborting the run.
    existing_keys = {(r.get("excel_hash"), r.get("uid")) for r in existing}
    print(f"  {len(existing)} existing record(s), {len(existing_keys)} unique keys\n")

    # ── 4. Download and parse each new CR ─────────────────────────────────────
    new_records = []
    skipped = 0
    failed = []

    print("Processing CRs...")
    for uid, title, submitted_by in cr_list:
        key = (excel_hash, uid)
        if key in existing_keys:
            print(f"  [{uid}] already indexed — skipping")
            skipped += 1
            continue

        docx_path, note = _download_with_retry(uid, cr_dir, eol_user, eol_password)
        if not docx_path:
            print(f"  [{uid}] FAILED — {note}")
            failed.append((uid, note))
            continue

        spec_number, version = parse_cr_cover(docx_path)
        if not spec_number or not version:
            print(f"  [{uid}] WARNING: could not parse cover page — skipping")
            failed.append((uid, "cover page parse failed"))
            continue

        print(f"  [{uid}] -> TS {spec_number} v{version}")
        new_records.append({
            "excel_hash":    excel_hash,
            "meeting_label": meeting_label,
            "uid":           uid,
            "title":         title,
            "submitted_by":  submitted_by,
            "spec_number":   spec_number,
            "version":       version,
            # Naive local timestamp; format kept unchanged so new records
            # match the ones already stored in the index.
            "parsed_at":     datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
        })
        # Remember the key so a uid listed twice in the same Excel is not
        # downloaded and indexed a second time within this run.
        existing_keys.add(key)

    print()
    print(f"Summary: {len(new_records)} new, {skipped} skipped, {len(failed)} failed")
    if failed:
        print("Failed CRs:")
        for uid, reason in failed:
            print(f"  [{uid}] {reason}")
    print()

    # ── 5. Merge and push ─────────────────────────────────────────────────────
    if new_records:
        all_records = existing + new_records
        print(f"Pushing {len(all_records)} record(s) to {args.hf_repo}...")
        try:
            push_hf_index(all_records, hf_token, args.hf_repo)
            print("  Push successful")
        except Exception as e:
            sys.exit(f"ERROR pushing to HF: {e}")
    else:
        print("No new records to push.")

    # ── 6. Sentinel line (watched by app.py) ──────────────────────────────────
    print(f"INDEX_COMPLETE excel_hash={excel_hash}")


# Run the indexer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()