File size: 6,540 Bytes
a610111
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#!/usr/bin/env python3
"""
build_cr_index.py — Build and push a CR index to a HuggingFace dataset.

Parses all Accepted CRs from the Excel, downloads each one (reusing cached
files), parses the cover page to get (spec_number, version), then pushes a
JSONL index to the HF dataset.

Usage:
    python3 build_cr_index.py <excel_path> --output-dir DIR --hf-repo ORG/REPO

Environment variables:
    EOL_USER, EOL_PASSWORD  — ETSI EOL credentials for download_cr
    HF_TOKEN                — HuggingFace token (write access to hf-repo)
"""

import argparse
import datetime
import hashlib
import os
import sys
import time
from pathlib import Path

# ── sys.path setup ────────────────────────────────────────────────────────────
# Put the directory containing this script at the front of sys.path so the
# fetch_crs / hf_cr_index imports below can be resolved from there
# (presumably they live alongside this script — confirm), independent of the
# current working directory. Must run before those imports.
SCRIPT_DIR = Path(__file__).parent
sys.path.insert(0, str(SCRIPT_DIR))

from fetch_crs import parse_excel_all_accepted, download_cr, parse_cr_cover, wsl_path
from hf_cr_index import load_hf_index, push_hf_index


def _download_with_retry(uid, cr_dir, eol_user, eol_password, attempts=3, delay=5):
    """Download one CR via ``download_cr``, retrying on failure.

    Args:
        uid: CR identifier passed through to ``download_cr``.
        cr_dir: Directory handed to ``download_cr`` as the download target.
        eol_user: EOL account name.
        eol_password: EOL account password.
        attempts: Maximum number of download attempts.
        delay: Seconds to sleep between two attempts.

    Returns:
        The ``(docx_path, note)`` pair of the last attempt. ``docx_path`` is
        falsy when every attempt failed; ``note`` then carries the reason.
    """
    docx_path, note = None, ""
    for attempt in range(1, attempts + 1):
        try:
            docx_path, note = download_cr(uid, cr_dir, eol_user, eol_password)
        except Exception as e:
            # Treat an unexpected error like a failed attempt so a single CR
            # cannot abort the whole batch before anything has been pushed.
            docx_path, note = None, f"unexpected error: {e}"
        if docx_path:
            break
        if attempt < attempts:
            print(
                f"  [{uid}] attempt {attempt}/{attempts} failed ({note})"
                f" — retrying in {delay}s"
            )
            time.sleep(delay)
    return docx_path, note


def main():
    """Build the CR index for one contribution list and push it to HuggingFace.

    Workflow:
        1. Hash the Excel file (first 16 hex chars of its SHA-256) and use the
           file stem as the meeting label.
        2. Parse every Accepted CR from the Excel file.
        3. Load the existing index from the HF dataset repo.
        4. For each CR not yet indexed under this Excel hash, download it,
           parse its cover page for ``(spec_number, version)`` and build a
           record.
        5. Push existing + new records when there is at least one new record.
        6. Print the ``INDEX_COMPLETE`` sentinel line.

    Reads ``EOL_USER``, ``EOL_PASSWORD`` and ``HF_TOKEN`` from the environment.

    Raises:
        SystemExit: with an error message when credentials are missing, the
            Excel file is missing or unparsable, or the push fails; with
            code 0 when the Excel file lists no Accepted CR.
    """
    ap = argparse.ArgumentParser(
        description="Build and push CR index to HuggingFace dataset.",
    )
    ap.add_argument("excel_path", help="Path to .xls or .xlsx contribution list")
    ap.add_argument(
        "--output-dir",
        default=str(Path.home() / "CR_Processing"),
        help="Base output directory (default: ~/CR_Processing)",
    )
    ap.add_argument(
        "--hf-repo",
        default="OrganizedProgrammers/CR_Index",
        help="HuggingFace dataset repo (default: OrganizedProgrammers/CR_Index)",
    )
    args = ap.parse_args()

    eol_user = os.environ.get("EOL_USER", "")
    eol_password = os.environ.get("EOL_PASSWORD", "")
    hf_token = os.environ.get("HF_TOKEN", "")

    if not eol_user or not eol_password:
        sys.exit("ERROR: EOL_USER and EOL_PASSWORD must be set")
    if not hf_token:
        sys.exit("ERROR: HF_TOKEN must be set")

    excel_path = Path(wsl_path(args.excel_path))
    # is_file() rather than exists(): a directory would pass exists() and then
    # crash in read_bytes() below instead of producing a clean error.
    if not excel_path.is_file():
        sys.exit(f"ERROR: Excel file not found: {excel_path}")

    output_dir = Path(wsl_path(args.output_dir)).expanduser()
    cr_dir = output_dir / "CRs"
    cr_dir.mkdir(parents=True, exist_ok=True)

    # ── 1. Compute Excel hash ─────────────────────────────────────────────────
    # Truncated SHA-256 of the file content; together with the CR uid it forms
    # the key used to detect already-indexed records.
    excel_hash = hashlib.sha256(excel_path.read_bytes()).hexdigest()[:16]
    meeting_label = excel_path.stem
    print(f"Excel:        {excel_path.name}")
    print(f"Excel hash:   {excel_hash}")
    print(f"Meeting:      {meeting_label}")
    print(f"HF repo:      {args.hf_repo}")
    print()

    # ── 2. Parse all Accepted CRs ─────────────────────────────────────────────
    print("Parsing Excel for all Accepted CRs...")
    try:
        cr_list = parse_excel_all_accepted(str(excel_path))
    except Exception as e:
        sys.exit(f"ERROR parsing Excel: {e}")
    print(f"Found {len(cr_list)} Accepted CR(s)\n")

    if not cr_list:
        print("Nothing to index.")
        # NOTE(review): this path exits without printing the INDEX_COMPLETE
        # sentinel — confirm that the watcher in app.py copes with that.
        sys.exit(0)

    # ── 3. Load existing HF index ─────────────────────────────────────────────
    print("Loading existing HF index...")
    try:
        existing = load_hf_index(hf_token, args.hf_repo)
    except Exception as e:
        # NOTE(review): falling back to an empty index means a later push only
        # contains this run's records. If push_hf_index replaces the remote
        # file, a transient load failure would drop older records — confirm.
        print(f"  WARNING: could not load existing index: {e}")
        existing = []
    # .get() so that one malformed record cannot abort the run with KeyError.
    existing_keys = {(r.get("excel_hash"), r.get("uid")) for r in existing}
    print(f"  {len(existing)} existing record(s), {len(existing_keys)} unique keys\n")

    # ── 4. Download and parse each new CR ─────────────────────────────────────
    new_records = []
    skipped = 0
    failed = []

    print("Processing CRs...")
    for uid, title, submitted_by in cr_list:
        key = (excel_hash, uid)
        if key in existing_keys:
            print(f"  [{uid}] already indexed — skipping")
            skipped += 1
            continue

        docx_path, note = _download_with_retry(uid, cr_dir, eol_user, eol_password)
        if not docx_path:
            print(f"  [{uid}] FAILED — {note}")
            failed.append((uid, note))
            continue

        try:
            spec_number, version = parse_cr_cover(docx_path)
        except Exception as e:
            # A broken document must not discard the records collected so far.
            print(f"  [{uid}] WARNING: could not parse cover page — skipping")
            failed.append((uid, f"cover page parse failed ({e})"))
            continue
        if not spec_number or not version:
            print(f"  [{uid}] WARNING: could not parse cover page — skipping")
            failed.append((uid, "cover page parse failed"))
            continue

        print(f"  [{uid}] -> TS {spec_number} v{version}")
        new_records.append({
            "excel_hash":    excel_hash,
            "meeting_label": meeting_label,
            "uid":           uid,
            "title":         title,
            "submitted_by":  submitted_by,
            "spec_number":   spec_number,
            "version":       version,
            # Naive local time, kept in the format of already stored records.
            "parsed_at":     datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
        })
        # Remember the key so a uid listed twice in the Excel file does not
        # produce a duplicate record within the same run.
        existing_keys.add(key)

    print()
    print(f"Summary: {len(new_records)} new, {skipped} skipped, {len(failed)} failed")
    if failed:
        print("Failed CRs:")
        for uid, reason in failed:
            print(f"  [{uid}] {reason}")
    print()

    # ── 5. Merge and push ─────────────────────────────────────────────────────
    if new_records:
        all_records = existing + new_records
        print(f"Pushing {len(all_records)} record(s) to {args.hf_repo}...")
        try:
            push_hf_index(all_records, hf_token, args.hf_repo)
            print("  Push successful")
        except Exception as e:
            sys.exit(f"ERROR pushing to HF: {e}")
    else:
        print("No new records to push.")

    # ── 6. Sentinel line (watched by app.py) ──────────────────────────────────
    print(f"INDEX_COMPLETE excel_hash={excel_hash}")


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()