# ApplyCRs/scripts/build_cr_index.py
# commit a610111 (heymenn) — features: independent from docfinder, eol creds, input ts mode
#!/usr/bin/env python3
"""
build_cr_index.py β€” Build and push a CR index to a HuggingFace dataset.
Parses all Accepted CRs from the Excel, downloads each one (reusing cached
files), parses the cover page to get (spec_number, version), then pushes a
JSONL index to the HF dataset.
Usage:
python3 build_cr_index.py <excel_path> --output-dir DIR --hf-repo ORG/REPO
Environment variables:
EOL_USER, EOL_PASSWORD β€” ETSI EOL credentials for download_cr
HF_TOKEN β€” HuggingFace token (write access to hf-repo)
"""
import argparse
import datetime
import hashlib
import os
import sys
import time
from pathlib import Path
# ── sys.path setup ────────────────────────────────────────────────────────────
SCRIPT_DIR = Path(__file__).parent
sys.path.insert(0, str(SCRIPT_DIR))
from fetch_crs import parse_excel_all_accepted, download_cr, parse_cr_cover, wsl_path
from hf_cr_index import load_hf_index, push_hf_index
def _require_env():
    """Read ETSI EOL credentials and the HF token from the environment.

    Exits with an error message if any required variable is missing.
    Returns (eol_user, eol_password, hf_token).
    """
    eol_user = os.environ.get("EOL_USER", "")
    eol_password = os.environ.get("EOL_PASSWORD", "")
    hf_token = os.environ.get("HF_TOKEN", "")
    if not eol_user or not eol_password:
        sys.exit("ERROR: EOL_USER and EOL_PASSWORD must be set")
    if not hf_token:
        sys.exit("ERROR: HF_TOKEN must be set")
    return eol_user, eol_password, hf_token


def _download_with_retries(uid, cr_dir, eol_user, eol_password, attempts=3, delay=5):
    """Download one CR, retrying up to *attempts* times with *delay*-second pauses.

    Returns (docx_path, note); docx_path is None when every attempt failed,
    and note carries the last failure reason reported by download_cr.
    """
    docx_path, note = None, ""
    for attempt in range(1, attempts + 1):
        docx_path, note = download_cr(uid, cr_dir, eol_user, eol_password)
        if docx_path:
            break
        if attempt < attempts:
            print(f" [{uid}] attempt {attempt}/{attempts} failed ({note}) — retrying in {delay}s")
            time.sleep(delay)
    return docx_path, note


def main():
    """Build the CR index from the Excel contribution list and push it to HF.

    Workflow: parse CLI args, validate credentials, hash the Excel file,
    enumerate Accepted CRs, skip those already present in the remote index,
    download + parse the cover page of each new CR, then merge and push.
    Exits non-zero on unrecoverable errors (missing env/file, Excel parse
    failure, HF push failure).
    """
    ap = argparse.ArgumentParser(
        description="Build and push CR index to HuggingFace dataset.",
    )
    ap.add_argument("excel_path", help="Path to .xls or .xlsx contribution list")
    ap.add_argument(
        "--output-dir",
        default=str(Path.home() / "CR_Processing"),
        help="Base output directory (default: ~/CR_Processing)",
    )
    ap.add_argument(
        "--hf-repo",
        default="OrganizedProgrammers/CR_Index",
        help="HuggingFace dataset repo (default: OrganizedProgrammers/CR_Index)",
    )
    args = ap.parse_args()

    eol_user, eol_password, hf_token = _require_env()

    # wsl_path normalizes Windows-style paths when running under WSL.
    excel_path = Path(wsl_path(args.excel_path))
    if not excel_path.exists():
        sys.exit(f"ERROR: Excel file not found: {excel_path}")
    output_dir = Path(wsl_path(args.output_dir)).expanduser()
    cr_dir = output_dir / "CRs"
    cr_dir.mkdir(parents=True, exist_ok=True)

    # ── 1. Compute Excel hash ─────────────────────────────────────────────────
    # First 16 hex chars of SHA-256 identify this Excel revision in the index.
    excel_hash = hashlib.sha256(excel_path.read_bytes()).hexdigest()[:16]
    meeting_label = excel_path.stem
    print(f"Excel: {excel_path.name}")
    print(f"Excel hash: {excel_hash}")
    print(f"Meeting: {meeting_label}")
    print(f"HF repo: {args.hf_repo}")
    print()

    # ── 2. Parse all Accepted CRs ─────────────────────────────────────────────
    print("Parsing Excel for all Accepted CRs...")
    try:
        cr_list = parse_excel_all_accepted(str(excel_path))
    except Exception as e:
        sys.exit(f"ERROR parsing Excel: {e}")
    print(f"Found {len(cr_list)} Accepted CR(s)\n")
    if not cr_list:
        print("Nothing to index.")
        sys.exit(0)

    # ── 3. Load existing HF index ─────────────────────────────────────────────
    # Best-effort: a missing/unreadable remote index degrades to a full rebuild.
    print("Loading existing HF index...")
    try:
        existing = load_hf_index(hf_token, args.hf_repo)
    except Exception as e:
        print(f" WARNING: could not load existing index: {e}")
        existing = []
    # .get() tolerates malformed records in the remote index instead of
    # crashing the whole run on a KeyError.
    existing_keys = {(r.get("excel_hash"), r.get("uid")) for r in existing}
    print(f" {len(existing)} existing record(s), {len(existing_keys)} unique keys\n")

    # ── 4. Download and parse each new CR ─────────────────────────────────────
    new_records = []
    skipped = 0
    failed = []  # list of (uid, reason) for the summary report
    print("Processing CRs...")
    for uid, title, submitted_by in cr_list:
        if (excel_hash, uid) in existing_keys:
            print(f" [{uid}] already indexed — skipping")
            skipped += 1
            continue
        docx_path, note = _download_with_retries(uid, cr_dir, eol_user, eol_password)
        if not docx_path:
            print(f" [{uid}] FAILED — {note}")
            failed.append((uid, note))
            continue
        # A single corrupt .docx must not abort the whole indexing run.
        try:
            spec_number, version = parse_cr_cover(docx_path)
        except Exception as e:
            print(f" [{uid}] WARNING: cover page parse raised {e} — skipping")
            failed.append((uid, f"cover page parse error: {e}"))
            continue
        if not spec_number or not version:
            print(f" [{uid}] WARNING: could not parse cover page — skipping")
            failed.append((uid, "cover page parse failed"))
            continue
        print(f" [{uid}] -> TS {spec_number} v{version}")
        new_records.append({
            "excel_hash": excel_hash,
            "meeting_label": meeting_label,
            "uid": uid,
            "title": title,
            "submitted_by": submitted_by,
            "spec_number": spec_number,
            "version": version,
            # UTC so timestamps are comparable across machines pushing to
            # the same shared index (same format as before, now tz-stable).
            "parsed_at": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%S"),
        })
    print()
    print(f"Summary: {len(new_records)} new, {skipped} skipped, {len(failed)} failed")
    if failed:
        print("Failed CRs:")
        for uid, reason in failed:
            print(f" [{uid}] {reason}")
        print()

    # ── 5. Merge and push ─────────────────────────────────────────────────────
    if new_records:
        all_records = existing + new_records
        print(f"Pushing {len(all_records)} record(s) to {args.hf_repo}...")
        try:
            push_hf_index(all_records, hf_token, args.hf_repo)
            print(" Push successful")
        except Exception as e:
            sys.exit(f"ERROR pushing to HF: {e}")
    else:
        print("No new records to push.")

    # ── 6. Sentinel line (watched by app.py) ──────────────────────────────────
    print(f"INDEX_COMPLETE excel_hash={excel_hash}")
# Standard entry guard: run the indexer only when executed as a script,
# not when imported (e.g. by app.py or the test suite).
if __name__ == "__main__":
    main()