#!/usr/bin/env python3
"""
build_cr_index.py – Build and push a CR index to a HuggingFace dataset.

Parses all Accepted CRs from the Excel, downloads each one (reusing cached
files), parses the cover page to get (spec_number, version), then pushes a
JSONL index to the HF dataset.

Usage:
    python3 build_cr_index.py <excel_path> --output-dir DIR --hf-repo ORG/REPO

Environment variables:
    EOL_USER, EOL_PASSWORD – ETSI EOL credentials for download_cr
    HF_TOKEN               – HuggingFace token (write access to hf-repo)
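
Example (illustrative values):
    EOL_USER=... EOL_PASSWORD=... HF_TOKEN=... \
        python3 build_cr_index.py contributions.xlsx --hf-repo ORG/REPO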
| """ | |

import argparse
import datetime
import hashlib
import os
import sys
import time
from pathlib import Path

# ── sys.path setup ────────────────────────────────────────────────────────────
SCRIPT_DIR = Path(__file__).parent
sys.path.insert(0, str(SCRIPT_DIR))
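
# Local helpers expected alongside this script: Excel parsing, CR download and
# cover-page parsing (fetch_crs), plus HF index load/push (hf_cr_index).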
from fetch_crs import parse_excel_all_accepted, download_cr, parse_cr_cover, wsl_path
from hf_cr_index import load_hf_index, push_hf_index


def main():
    ap = argparse.ArgumentParser(
        description="Build and push CR index to HuggingFace dataset.",
    )
    ap.add_argument("excel_path", help="Path to .xls or .xlsx contribution list")
    ap.add_argument(
        "--output-dir",
        default=str(Path.home() / "CR_Processing"),
        help="Base output directory (default: ~/CR_Processing)",
    )
    ap.add_argument(
        "--hf-repo",
        default="OrganizedProgrammers/CR_Index",
        help="HuggingFace dataset repo (default: OrganizedProgrammers/CR_Index)",
    )
    args = ap.parse_args()

    eol_user = os.environ.get("EOL_USER", "")
    eol_password = os.environ.get("EOL_PASSWORD", "")
    hf_token = os.environ.get("HF_TOKEN", "")
    if not eol_user or not eol_password:
        sys.exit("ERROR: EOL_USER and EOL_PASSWORD must be set")
    if not hf_token:
        sys.exit("ERROR: HF_TOKEN must be set")

    excel_path = Path(wsl_path(args.excel_path))
    if not excel_path.exists():
        sys.exit(f"ERROR: Excel file not found: {excel_path}")

    output_dir = Path(wsl_path(args.output_dir)).expanduser()
    cr_dir = output_dir / "CRs"
    cr_dir.mkdir(parents=True, exist_ok=True)

    # ── 1. Compute Excel hash ─────────────────────────────────────────────────
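    # The truncated SHA-256 of the Excel file identifies this contribution-list
    # snapshot; it is stored in every record and used for de-duplication below.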
    excel_hash = hashlib.sha256(excel_path.read_bytes()).hexdigest()[:16]
    meeting_label = excel_path.stem

    print(f"Excel: {excel_path.name}")
    print(f"Excel hash: {excel_hash}")
    print(f"Meeting: {meeting_label}")
    print(f"HF repo: {args.hf_repo}")
    print()

    # ── 2. Parse all Accepted CRs ─────────────────────────────────────────────
    print("Parsing Excel for all Accepted CRs...")
    try:
        cr_list = parse_excel_all_accepted(str(excel_path))
    except Exception as e:
        sys.exit(f"ERROR parsing Excel: {e}")
    print(f"Found {len(cr_list)} Accepted CR(s)\n")
    if not cr_list:
        print("Nothing to index.")
        sys.exit(0)

    # ── 3. Load existing HF index ─────────────────────────────────────────────
    print("Loading existing HF index...")
    try:
        existing = load_hf_index(hf_token, args.hf_repo)
    except Exception as e:
        print(f" WARNING: could not load existing index: {e}")
        existing = []
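    # De-duplicate on (excel_hash, uid): a CR is re-indexed only when it comes
    # from a new Excel snapshot.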
    existing_keys = {(r["excel_hash"], r["uid"]) for r in existing}
    print(f" {len(existing)} existing record(s), {len(existing_keys)} unique keys\n")

    # ── 4. Download and parse each new CR ─────────────────────────────────────
    new_records = []
    skipped = 0
    failed = []
    print("Processing CRs...")
    for uid, title, submitted_by in cr_list:
        if (excel_hash, uid) in existing_keys:
            print(f" [{uid}] already indexed – skipping")
            skipped += 1
            continue

        # Retry loop (3 attempts)
        docx_path = None
        note = ""
        for attempt in range(1, 4):
            docx_path, note = download_cr(uid, cr_dir, eol_user, eol_password)
            if docx_path:
                break
            if attempt < 3:
                print(f" [{uid}] attempt {attempt}/3 failed ({note}) – retrying in 5s")
                time.sleep(5)

        if not docx_path:
            print(f" [{uid}] FAILED – {note}")
            failed.append((uid, note))
            continue
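        # Read (spec_number, version) from the CR cover page; CRs missing either
        # value are counted as failures rather than indexed.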
        spec_number, version = parse_cr_cover(docx_path)
        if not spec_number or not version:
            print(f" [{uid}] WARNING: could not parse cover page – skipping")
            failed.append((uid, "cover page parse failed"))
            continue

        print(f" [{uid}] -> TS {spec_number} v{version}")
        new_records.append({
            "excel_hash": excel_hash,
            "meeting_label": meeting_label,
            "uid": uid,
            "title": title,
            "submitted_by": submitted_by,
            "spec_number": spec_number,
            "version": version,
            "parsed_at": datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
        })

    print()
    print(f"Summary: {len(new_records)} new, {skipped} skipped, {len(failed)} failed")
    if failed:
        print("Failed CRs:")
        for uid, reason in failed:
            print(f" [{uid}] {reason}")
        print()

    # ── 5. Merge and push ─────────────────────────────────────────────────────
    if new_records:
        all_records = existing + new_records
        print(f"Pushing {len(all_records)} record(s) to {args.hf_repo}...")
        try:
            push_hf_index(all_records, hf_token, args.hf_repo)
            print(" Push successful")
        except Exception as e:
            sys.exit(f"ERROR pushing to HF: {e}")
    else:
        print("No new records to push.")

    # ── 6. Sentinel line (watched by app.py) ──────────────────────────────────
    print(f"INDEX_COMPLETE excel_hash={excel_hash}")


if __name__ == "__main__":
    main()