Spaces:

OrganizedProgrammers
/

ApplyCRs

Sleeping

App Files Community

heymenn committed on Apr 21

Commit

b5fc740

1 Parent(s): 8692cb7

reduce codebase size, prevent errors and unknowns and configure new ETSI download process

Browse files

Files changed (9) hide show

app.py +39 -96
scripts/cr_parser.py +20 -1
scripts/docx_helpers.py +5 -2
scripts/etsi_client.py +5 -4
scripts/fetch_crs.py +80 -122
scripts/finalize_ts.py +16 -4
scripts/hf_cr_index.py +15 -12
scripts/orchestrate_cr.py +179 -312
scripts/ts_applicator.py +66 -44

app.py CHANGED Viewed

@@ -264,6 +264,22 @@ def load_hf_index_cached(hf_token: str, hf_repo: str) -> list[dict]:
     return st.session_state[key]
 # ── Page config ───────────────────────────────────────────────────────────────
 st.set_page_config(
     page_title="CR Application Tool",
@@ -454,29 +470,11 @@ elif status == "upload":
                     env["EOL_PASSWORD"] = st.session_state.eol_password
                     # HF_TOKEN is already in env via os.environ
-                    log_file = open(index_log, "w")
-                    proc = subprocess.Popen(
-                        cmd,
-                        stdout=log_file,
-                        stderr=subprocess.STDOUT,
-                        env=env,
-                    )
-                    log_file.close()
-                    threading.Thread(
-                        target=_run_and_save_rc,
-                        args=(proc, rc_path),
-                        daemon=True,
-                    ).start()
-                    st.session_state.proc = proc
-                    state["status"]     = "indexing"
-                    state["pid"]        = proc.pid
-                    state["index_log"]  = index_log
-                    state["output_dir"] = ""   # no pipeline output yet
-                    state["started_at"] = datetime.now().isoformat()
-                    save_state(sid, state)
-                    st.rerun()
 # ════════════════════════════════════════════════════════════════════════════
 # INDEXING  (build_cr_index.py running)
@@ -588,31 +586,13 @@ elif status == "ts_select":
                     env["EOL_PASSWORD"] = st.session_state.eol_password
                     # HF_TOKEN already in env via os.environ
-                    log_file = open(str(log_path), "w")
-                    proc = subprocess.Popen(
-                        cmd,
-                        stdout=log_file,
-                        stderr=subprocess.STDOUT,
-                        env=env,
-                    )
-                    log_file.close()
-                    threading.Thread(
-                        target=_run_and_save_rc,
-                        args=(proc, rc_path),
-                        daemon=True,
-                    ).start()
-                    st.session_state.proc = proc
-                    state["ts_id"]          = selected_spec
-                    state["status"]         = "running"
-                    state["pid"]            = proc.pid
-                    state["output_dir"]     = str(output_dir)
-                    state["log_path"]       = str(log_path)
-                    state["run_log_paths"]  = [str(log_path)]
-                    state["started_at"]     = datetime.now().isoformat()
-                    save_state(sid, state)
-                    st.rerun()
 # ════════════════════════════════════════════════════════════════════════════
 # PREVIEW
@@ -671,32 +651,12 @@ elif status == "preview":
             env["EOL_USER"] = st.session_state.eol_user
             env["EOL_PASSWORD"] = st.session_state.eol_password
-            log_file = open(str(log_path), "w")
-            proc = subprocess.Popen(
-                cmd,
-                stdout=log_file,
-                stderr=subprocess.STDOUT,
-                env=env,
-            )
-            log_file.close()
-            # Background thread writes returncode file when process finishes
-            threading.Thread(
-                target=_run_and_save_rc,
-                args=(proc, rc_path),
-                daemon=True,
-            ).start()
-            st.session_state.proc = proc
-            state["status"] = "running"
-            state["pid"] = proc.pid
-            state["output_dir"] = str(output_dir)
-            state["log_path"] = str(log_path)
-            state["run_log_paths"] = [str(log_path)]
-            state["started_at"] = datetime.now().isoformat()
-            save_state(sid, state)
-            st.rerun()
 # ════════════════════════════════════════════════════════════════════════════
 # RUNNING
@@ -873,8 +833,6 @@ elif status in ("done", "error"):
                 if st.button("▶ Apply CRs to recovered TSs", type="primary"):
                     retry_log = str(session_dir(sid) / f"pipeline_{int(time.time())}_retry.log")
-                    _rc_path(sid).unlink(missing_ok=True)   # clear old returncode
                     cmd = [
                         sys.executable,
                         str(SCRIPTS_DIR / "orchestrate_cr.py"),
@@ -885,26 +843,11 @@ elif status in ("done", "error"):
                     env["EOL_USER"]     = st.session_state.eol_user
                     env["EOL_PASSWORD"] = st.session_state.eol_password
-                    log_file = open(retry_log, "w")
-                    proc = subprocess.Popen(
-                        cmd, stdout=log_file, stderr=subprocess.STDOUT, env=env
-                    )
-                    log_file.close()
-                    threading.Thread(
-                        target=_run_and_save_rc,
-                        args=(proc, _rc_path(sid)),
-                        daemon=True,
-                    ).start()
-                    st.session_state.proc = proc
-                    state["status"]          = "running"
-                    state["pid"]             = proc.pid
-                    state["log_path"]        = retry_log
-                    state["run_log_paths"]   = state.get("run_log_paths", []) + [retry_log]
-                    state["started_at"]      = datetime.now().isoformat()
-                    save_state(sid, state)
-                    st.rerun()
             else:
                 st.warning("No TSs available yet — retry download or upload DOCX files above.")

     return st.session_state[key]
+def _launch_proc(cmd, env, log_path, sid, state, extra_state: dict):
+    """Open log_path, Popen cmd, start rc-writer thread, update state, rerun."""
+    rc_path = _rc_path(sid)
+    rc_path.unlink(missing_ok=True)
+    log_file = open(str(log_path), "w")
+    proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT, env=env)
+    log_file.close()
+    threading.Thread(target=_run_and_save_rc, args=(proc, rc_path), daemon=True).start()
+    st.session_state.proc = proc
+    state.update(extra_state)
+    state["pid"] = proc.pid
+    state["started_at"] = datetime.now().isoformat()
+    save_state(sid, state)
+    st.rerun()
 # ── Page config ───────────────────────────────────────────────────────────────
 st.set_page_config(
     page_title="CR Application Tool",
                     env["EOL_PASSWORD"] = st.session_state.eol_password
                     # HF_TOKEN is already in env via os.environ
+                    _launch_proc(cmd, env, index_log, sid, state, {
+                        "status":     "indexing",
+                        "index_log":  index_log,
+                        "output_dir": "",   # no pipeline output yet
+                    })
 # ════════════════════════════════════════════════════════════════════════════
 # INDEXING  (build_cr_index.py running)
                     env["EOL_PASSWORD"] = st.session_state.eol_password
                     # HF_TOKEN already in env via os.environ
+                    _launch_proc(cmd, env, log_path, sid, state, {
+                        "ts_id":         selected_spec,
+                        "status":        "running",
+                        "output_dir":    str(output_dir),
+                        "log_path":      str(log_path),
+                        "run_log_paths": [str(log_path)],
+                    })
 # ════════════════════════════════════════════════════════════════════════════
 # PREVIEW
             env["EOL_USER"] = st.session_state.eol_user
             env["EOL_PASSWORD"] = st.session_state.eol_password
+            _launch_proc(cmd, env, log_path, sid, state, {
+                "status":        "running",
+                "output_dir":    str(output_dir),
+                "log_path":      str(log_path),
+                "run_log_paths": [str(log_path)],
+            })
 # ════════════════════════════════════════════════════════════════════════════
 # RUNNING
                 if st.button("▶ Apply CRs to recovered TSs", type="primary"):
                     retry_log = str(session_dir(sid) / f"pipeline_{int(time.time())}_retry.log")
                     cmd = [
                         sys.executable,
                         str(SCRIPTS_DIR / "orchestrate_cr.py"),
                     env["EOL_USER"]     = st.session_state.eol_user
                     env["EOL_PASSWORD"] = st.session_state.eol_password
+                    _launch_proc(cmd, env, retry_log, sid, state, {
+                        "status":        "running",
+                        "log_path":      retry_log,
+                        "run_log_paths": state.get("run_log_paths", []) + [retry_log],
+                    })
             else:
                 st.warning("No TSs available yet — retry download or upload DOCX files above.")

scripts/cr_parser.py CHANGED Viewed

@@ -301,7 +301,7 @@ def _parse_body(body, changes):
     sec_anchor = ''
     def flush_section():
-        nonlocal sec_state, sec_anchor
         if not sec_del and not sec_ins:
             sec_del.clear(); sec_sep.clear(); sec_ins.clear()
             sec_state = 'stable'
@@ -315,6 +315,17 @@ def _parse_body(body, changes):
                 if t:
                     del_heading = t
                     break
         # Serialize all elements for the manifest (del + sep + ins)
         all_elems = sec_del + sec_sep + sec_ins
         elements_xml = [etree.tostring(e, encoding='unicode') for e in all_elems]
@@ -332,6 +343,14 @@ def _parse_body(body, changes):
             },
             'elements_xml': elements_xml,
         })
         sec_del.clear(); sec_sep.clear(); sec_ins.clear()
         sec_state = 'stable'

     sec_anchor = ''
     def flush_section():
+        nonlocal sec_state, sec_anchor, prev_stable_text
         if not sec_del and not sec_ins:
             sec_del.clear(); sec_sep.clear(); sec_ins.clear()
             sec_state = 'stable'
                 if t:
                     del_heading = t
                     break
+        # Fallback: if first deleted element was a table, use its first cell text
+        if not del_heading:
+            for e in sec_del:
+                tag = e.tag.split('}')[-1] if '}' in e.tag else e.tag
+                if tag == 'tbl':
+                    first_tc = e.find('.//' + qn('w:tc'))
+                    if first_tc is not None:
+                        p = first_tc.find('.//' + qn('w:p'))
+                        del_heading = (_para_new_text(p) if p is not None
+                                       else _para_new_text(first_tc)).strip()
+                    break
         # Serialize all elements for the manifest (del + sep + ins)
         all_elems = sec_del + sec_sep + sec_ins
         elements_xml = [etree.tostring(e, encoding='unicode') for e in all_elems]
             },
             'elements_xml': elements_xml,
         })
+        # Refresh anchor so subsequent para_insert targets the new text, not the deleted one
+        if sec_ins:
+            last_p = next((e for e in reversed(sec_ins)
+                           if (e.tag.split('}')[-1] if '}' in e.tag else e.tag) == 'p'), None)
+            if last_p is not None:
+                candidate = _para_new_text(last_p).strip()
+                if candidate:
+                    prev_stable_text = candidate
         sec_del.clear(); sec_sep.clear(); sec_ins.clear()
         sec_state = 'stable'

scripts/docx_helpers.py CHANGED Viewed

@@ -75,7 +75,7 @@ def map_sections(doc, clause_numbers):
     for i, para in enumerate(doc.paragraphs):
         text  = para.text.strip()
-        style = para.style.name
         matched = False
         for clause in clause_numbers:
@@ -418,9 +418,12 @@ def tracked_insert_table_row(tbl, cell_texts, rev, author=AUTHOR, date=DATE):
     # Find the last row that contains at least one non-empty <w:t> node.
     # This skips pre-allocated blank rows at the table bottom.
     last_content_tr = all_trs[-1]
     for tr in reversed(all_trs):
-        if any(t.text and t.text.strip() for t in tr.findall('.//' + qn('w:t'))):
             last_content_tr = tr
             break

     for i, para in enumerate(doc.paragraphs):
         text  = para.text.strip()
+        style = para.style.name if para.style else 'Normal'
         matched = False
         for clause in clause_numbers:
     # Find the last row that contains at least one non-empty <w:t> node.
     # This skips pre-allocated blank rows at the table bottom.
+    _CONTENT_TAGS = {qn('w:t'), qn('w:hyperlink'), qn('w:drawing'), qn('w:object')}
     last_content_tr = all_trs[-1]
     for tr in reversed(all_trs):
+        has_text  = any(t.text and t.text.strip() for t in tr.findall('.//' + qn('w:t')))
+        has_other = any(el.tag in _CONTENT_TAGS for el in tr.iter())
+        if has_text or has_other:
             last_content_tr = tr
             break

scripts/etsi_client.py CHANGED Viewed

@@ -267,8 +267,6 @@ class ETSISpecFinder:
             today = datetime.date.today().isoformat()
             base_params = {
-                "option":     "com_standardssearch",
-                "view":       "data",
                 "format":     "json",
                 "page":       "1",
                 "title":      "1",
@@ -304,9 +302,12 @@ class ETSISpecFinder:
                 params = {**base_params, "search": query}
                 try:
                     resp = requests.get(
-                        "https://www.etsi.org/",
                         params=params,
-                        headers=self.headers,
                         verify=False,
                         timeout=15,
                         proxies=_get_proxies(),

             today = datetime.date.today().isoformat()
             base_params = {
                 "format":     "json",
                 "page":       "1",
                 "title":      "1",
                 params = {**base_params, "search": query}
                 try:
                     resp = requests.get(
+                        "https://www.etsi.org/custom/standardssearch/data.php",
                         params=params,
+                        headers={
+                            **self.headers,
+                            "Referer": "https://www.etsi.org/standards/",
+                        },
                         verify=False,
                         timeout=15,
                         proxies=_get_proxies(),

scripts/fetch_crs.py CHANGED Viewed

@@ -67,9 +67,9 @@ def parse_excel_all_accepted(excel_path: str):
     ext = path.suffix.lower()
     if ext == ".xls":
-        return _parse_xls_all(path)
     elif ext == ".xlsx":
-        return _parse_xlsx_all(path)
     else:
         raise ValueError(f"Unsupported file extension: {ext!r}. Expected .xls or .xlsx")
@@ -78,7 +78,12 @@ def _name_pattern(name: str) -> re.Pattern:
     return re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE)
-def _parse_xls(path: Path, person_name: str):
     try:
         import xlrd
     except ImportError:
@@ -101,12 +106,15 @@ def _parse_xls(path: Path, person_name: str):
     by_col = col.get("SubmittedBy") or col.get("Submitted By") or col.get("submittedby")
     title_col = col.get("Title") or col.get("title")
-    for name, c in [("Uid", uid_col), ("Type", type_col),
-                    ("Status", status_col), ("SubmittedBy", by_col)]:
-        if c is None:
             raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")
-    pattern = _name_pattern(person_name)
     results = []
     for r in range(2, ws.nrows):  # skip header + empty duplicate
@@ -120,15 +128,20 @@ def _parse_xls(path: Path, person_name: str):
             continue
         if status != "Accepted":
             continue
-        if not pattern.search(submitted_by):
             continue
-        results.append((uid, title))
     return results
-def _parse_xlsx(path: Path, person_name: str):
     try:
         import openpyxl
     except ImportError:
@@ -153,12 +166,15 @@ def _parse_xlsx(path: Path, person_name: str):
     by_col = col.get("SubmittedBy") or col.get("Submitted By") or col.get("submittedby")
     title_col = col.get("Title") or col.get("title")
-    for name, c in [("Uid", uid_col), ("Type", type_col),
-                    ("Status", status_col), ("SubmittedBy", by_col)]:
-        if c is None:
             raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")
-    pattern = _name_pattern(person_name)
     results = []
     for row in rows:
@@ -178,107 +194,10 @@ def _parse_xlsx(path: Path, person_name: str):
             continue
         if status != "Accepted":
             continue
-        if not pattern.search(submitted_by):
             continue
-        results.append((uid, title))
-    return results
-def _parse_xls_all(path: Path):
-    """Return (uid, title, submitted_by) for all Accepted CRs (no person filter)."""
-    try:
-        import xlrd
-    except ImportError:
-        sys.exit("ERROR: xlrd is not installed. Run: pip install xlrd")
-    wb = xlrd.open_workbook(str(path))
-    try:
-        ws = wb.sheet_by_name("Contributions")
-    except xlrd.XLRDError:
-        ws = wb.sheet_by_index(0)
-    headers = [str(ws.cell_value(0, c)).strip() for c in range(ws.ncols)]
-    col = {h: i for i, h in enumerate(headers)}
-    uid_col = col.get("Uid") or col.get("UID") or col.get("uid")
-    type_col = col.get("Type") or col.get("type")
-    status_col = col.get("Status") or col.get("status")
-    by_col = col.get("SubmittedBy") or col.get("Submitted By") or col.get("submittedby")
-    title_col = col.get("Title") or col.get("title")
-    for name, c in [("Uid", uid_col), ("Type", type_col),
-                    ("Status", status_col), ("SubmittedBy", by_col)]:
-        if c is None:
-            raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")
-    results = []
-    for r in range(2, ws.nrows):
-        uid = str(ws.cell_value(r, uid_col)).strip()
-        doc_type = str(ws.cell_value(r, type_col)).strip()
-        status = str(ws.cell_value(r, status_col)).strip()
-        submitted_by = str(ws.cell_value(r, by_col)).strip()
-        title = str(ws.cell_value(r, title_col)).strip() if title_col is not None else ""
-        if doc_type != "CR":
-            continue
-        if status != "Accepted":
-            continue
-        results.append((uid, title, submitted_by))
-    return results
-def _parse_xlsx_all(path: Path):
-    """Return (uid, title, submitted_by) for all Accepted CRs (no person filter)."""
-    try:
-        import openpyxl
-    except ImportError:
-        sys.exit("ERROR: openpyxl is not installed. Run: pip install openpyxl")
-    wb = openpyxl.load_workbook(str(path), read_only=True, data_only=True)
-    ws = wb["Contributions"] if "Contributions" in wb.sheetnames else wb.active
-    rows = iter(ws.iter_rows(values_only=True))
-    header_row = next(rows)
-    headers = [str(h).strip() if h is not None else "" for h in header_row]
-    col = {h: i for i, h in enumerate(headers)}
-    next(rows, None)  # skip empty duplicate row
-    uid_col = col.get("Uid") or col.get("UID") or col.get("uid")
-    type_col = col.get("Type") or col.get("type")
-    status_col = col.get("Status") or col.get("status")
-    by_col = col.get("SubmittedBy") or col.get("Submitted By") or col.get("submittedby")
-    title_col = col.get("Title") or col.get("title")
-    for name, c in [("Uid", uid_col), ("Type", type_col),
-                    ("Status", status_col), ("SubmittedBy", by_col)]:
-        if c is None:
-            raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")
-    results = []
-    for row in rows:
-        def cell(c):
-            v = row[c] if c < len(row) else None
-            return str(v).strip() if v is not None else ""
-        uid = cell(uid_col)
-        doc_type = cell(type_col)
-        status = cell(status_col)
-        submitted_by = cell(by_col)
-        title = cell(title_col) if title_col is not None else ""
-        if not uid:
-            continue
-        if doc_type != "CR":
-            continue
-        if status != "Accepted":
-            continue
-        results.append((uid, title, submitted_by))
     return results
@@ -336,8 +255,29 @@ def download_cr(uid: str, cr_dir: Path, eol_user: str, eol_password: str):
 # Step 3 — Parse CR Cover Pages
 # ---------------------------------------------------------------------------
-SPEC_PATTERN = re.compile(r"^\d{3}\s\d{3}$")
-VERSION_PATTERN = re.compile(r"^\d+\.\d+\.\d+$")
 def parse_cr_cover(docx_path: Path):
@@ -360,7 +300,9 @@ def parse_cr_cover(docx_path: Path):
     if not doc.tables:
         return None, None
-    table = doc.tables[0]
     # Collect all non-empty cell texts in order
     cells = []
@@ -376,20 +318,36 @@ def parse_cr_cover(docx_path: Path):
     version = None
     for i, text in enumerate(cells):
-        # Look for spec number: "NNN NNN" pattern
-        if SPEC_PATTERN.match(text) and spec_number is None:
             spec_number = text
-        # Look for version: cell immediately after "Current version:"
         if "Current version:" in text and i + 1 < len(cells):
             candidate = cells[i + 1]
             if VERSION_PATTERN.match(candidate):
-                version = candidate
-        # Also accept "Current version" without colon
         if text in ("Current version:", "Current version") and version is None:
             if i + 1 < len(cells) and VERSION_PATTERN.match(cells[i + 1]):
-                version = cells[i + 1]
     if spec_number:
         spec_number = spec_number.replace('\xa0', ' ').strip()

     ext = path.suffix.lower()
     if ext == ".xls":
+        return _parse_xls(path)
     elif ext == ".xlsx":
+        return _parse_xlsx(path)
     else:
         raise ValueError(f"Unsupported file extension: {ext!r}. Expected .xls or .xlsx")
     return re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE)
+def _parse_xls(path: Path, person_name: str | None = None):
+    """
+    Return Accepted CRs from an .xls file.
+    If person_name is given: return (uid, title) filtered to that name.
+    If person_name is None: return (uid, title, submitted_by) for all.
+    """
     try:
         import xlrd
     except ImportError:
     by_col = col.get("SubmittedBy") or col.get("Submitted By") or col.get("submittedby")
     title_col = col.get("Title") or col.get("title")
+    for name, c, required in [
+        ("Uid", uid_col, True), ("Type", type_col, True),
+        ("Status", status_col, True), ("SubmittedBy", by_col, True),
+        ("Title", title_col, False),
+    ]:
+        if c is None and required:
             raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")
+    pattern = _name_pattern(person_name) if person_name else None
     results = []
     for r in range(2, ws.nrows):  # skip header + empty duplicate
             continue
         if status != "Accepted":
             continue
+        if pattern and not pattern.search(submitted_by):
             continue
+        results.append((uid, title) if person_name is not None else (uid, title, submitted_by))
     return results
+def _parse_xlsx(path: Path, person_name: str | None = None):
+    """
+    Return Accepted CRs from an .xlsx file.
+    If person_name is given: return (uid, title) filtered to that name.
+    If person_name is None: return (uid, title, submitted_by) for all.
+    """
     try:
         import openpyxl
     except ImportError:
     by_col = col.get("SubmittedBy") or col.get("Submitted By") or col.get("submittedby")
     title_col = col.get("Title") or col.get("title")
+    for name, c, required in [
+        ("Uid", uid_col, True), ("Type", type_col, True),
+        ("Status", status_col, True), ("SubmittedBy", by_col, True),
+        ("Title", title_col, False),
+    ]:
+        if c is None and required:
             raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")
+    pattern = _name_pattern(person_name) if person_name else None
     results = []
     for row in rows:
             continue
         if status != "Accepted":
             continue
+        if pattern and not pattern.search(submitted_by):
             continue
+        results.append((uid, title) if person_name is not None else (uid, title, submitted_by))
     return results
 # Step 3 — Parse CR Cover Pages
 # ---------------------------------------------------------------------------
+SPEC_PATTERN  = re.compile(r"^\d{3}\s\d{3}(-\d+)*$")   # "102 221" or "102 230-2"
+SPEC_SEARCH   = re.compile(r"\b\d{3}\s\d{3}(?:-\d+)*\b") # substring search fallback
+VERSION_PATTERN = re.compile(r"^[Vv]?\d+\.\d+(\.\d+)?$")  # X.Y or X.Y.Z, optional V prefix
+VERSION_SEARCH  = re.compile(r"\b\d+\.\d+\.\d+\b")         # substring fallback
+def _normalise_version(v: str) -> str:
+    """Strip optional V prefix and pad to X.Y.Z."""
+    v = v.lstrip('Vv')
+    parts = v.split('.')
+    while len(parts) < 3:
+        parts.append('0')
+    return '.'.join(parts[:3])
+def _find_cover_table(doc):
+    """Return the CR cover table, scanning all tables for one containing CHANGE REQUEST."""
+    MARKERS = {"CHANGE REQUEST", "CR", "CHANGE  REQUEST"}
+    for tbl in doc.tables:
+        cells_text = {c.text.strip() for row in tbl.rows for c in row.cells}
+        if cells_text & MARKERS:
+            return tbl
+    return None
 def parse_cr_cover(docx_path: Path):
     if not doc.tables:
         return None, None
+    table = _find_cover_table(doc)
+    if table is None:
+        return None, None
     # Collect all non-empty cell texts in order
     cells = []
     version = None
     for i, text in enumerate(cells):
+        # ── Strategy 1: exact cell match "NNN NNN" or "NNN NNN-N" ────────────
+        if spec_number is None and SPEC_PATTERN.match(text):
             spec_number = text
+        # ── Strategy 2: positional — cell immediately after "CHANGE REQUEST" ─
+        # The cover table always places the spec number in the cell right after
+        # the "CHANGE REQUEST" label.
+        if spec_number is None and text.strip() == "CHANGE REQUEST" and i + 1 < len(cells):
+            candidate = cells[i + 1].strip()
+            if SPEC_PATTERN.match(candidate):
+                spec_number = candidate
+        # ── Version: cell immediately after "Current version:" ───────────────
         if "Current version:" in text and i + 1 < len(cells):
             candidate = cells[i + 1]
             if VERSION_PATTERN.match(candidate):
+                version = _normalise_version(candidate)
         if text in ("Current version:", "Current version") and version is None:
             if i + 1 < len(cells) and VERSION_PATTERN.match(cells[i + 1]):
+                version = _normalise_version(cells[i + 1])
+    # ── Strategy 3: substring search across all cells ─────────────────────────
+    # Catches cases where the spec number is embedded in a longer cell string.
+    if spec_number is None:
+        for text in cells:
+            m = SPEC_SEARCH.search(text)
+            if m:
+                spec_number = m.group(0)
+                break
     if spec_number:
         spec_number = spec_number.replace('\xa0', ' ').strip()

scripts/finalize_ts.py CHANGED Viewed

@@ -27,6 +27,7 @@ from docx_helpers import (
     AUTHOR,
     DATE,
 )
 # ── Path helpers ──────────────────────────────────────────────────────────────
@@ -58,6 +59,8 @@ def compute_pub_date():
 def derive_new_version(v: str) -> str:
     """Increment middle component of X.Y.Z → X.(Y+1).0."""
     parts = v.split('.')
     parts[1] = str(int(parts[1]) + 1)
     parts[2] = '0'
     return '.'.join(parts)
@@ -75,7 +78,9 @@ def extract_cr_metadata(cr_docx_path: str) -> dict:
     if not doc.tables:
         raise ValueError('CR has no tables — cannot extract metadata')
-    tbl = doc.tables[0]
     # Collect all cell texts for scanning
     cells = []
@@ -165,10 +170,16 @@ def _detect_meeting_separator(tbl):
     number, e.g. '#' in 'SET#115' or '-' in 'SET-119'.
     Returns the detected separator character, defaulting to '#'.
     """
     for row in reversed(tbl.rows):
         cells = row.cells
-        if len(cells) > 1:
-            text = cells[1].text.strip()
             if text:
                 m = re.search(r'[A-Za-z]([^A-Za-z0-9])\d', text)
                 if m:
@@ -198,7 +209,8 @@ def find_change_history_table(ts_doc):
             continue
         if tbl.rows:
             header_text = ' '.join(c.text.strip() for c in tbl.rows[0].cells).lower()
-            if any(kw in header_text for kw in ('cr', 'date', 'meeting', 'rev')):
                 return tbl
     raise NoChangeHistoryTable(
         'No Change History table found in this document '

     AUTHOR,
     DATE,
 )
+from fetch_crs import _find_cover_table
 # ── Path helpers ──────────────────────────────────────────────────────────────
 def derive_new_version(v: str) -> str:
     """Increment middle component of X.Y.Z → X.(Y+1).0."""
     parts = v.split('.')
+    if len(parts) < 3:
+        parts += ['0'] * (3 - len(parts))
     parts[1] = str(int(parts[1]) + 1)
     parts[2] = '0'
     return '.'.join(parts)
     if not doc.tables:
         raise ValueError('CR has no tables — cannot extract metadata')
+    tbl = _find_cover_table(doc)
+    if tbl is None:
+        raise ValueError('CR cover table not found — no table containing "CHANGE REQUEST"')
     # Collect all cell texts for scanning
     cells = []
     number, e.g. '#' in 'SET#115' or '-' in 'SET-119'.
     Returns the detected separator character, defaulting to '#'.
     """
+    meet_col = 1  # default: standard ETSI Change History has Meeting in col 1
+    if tbl.rows:
+        for c_idx, cell in enumerate(tbl.rows[0].cells):
+            if any(kw in cell.text.lower() for kw in ('meeting', 'body', 'tsg')):
+                meet_col = c_idx
+                break
     for row in reversed(tbl.rows):
         cells = row.cells
+        if len(cells) > meet_col:
+            text = cells[meet_col].text.strip()
             if text:
                 m = re.search(r'[A-Za-z]([^A-Za-z0-9])\d', text)
                 if m:
             continue
         if tbl.rows:
             header_text = ' '.join(c.text.strip() for c in tbl.rows[0].cells).lower()
+            header_words = set(re.findall(r'\b\w+\b', header_text))
+            if {'cr', 'date'}.issubset(header_words):
                 return tbl
     raise NoChangeHistoryTable(
         'No Change History table found in this document '

scripts/hf_cr_index.py CHANGED Viewed

@@ -44,15 +44,18 @@ def push_hf_index(records: list[dict], hf_token: str, hf_repo: str) -> None:
         exist_ok=True,
         token=hf_token,
     )
-    tmp = Path("/tmp/cr_index.jsonl")
-    tmp.write_text(
-        "\n".join(json.dumps(r, ensure_ascii=False) for r in records),
-        encoding="utf-8",
-    )
-    api.upload_file(
-        path_or_fileobj=str(tmp),
-        path_in_repo="cr_index.jsonl",
-        repo_id=hf_repo,
-        repo_type="dataset",
-        token=hf_token,
-    )

         exist_ok=True,
         token=hf_token,
     )
+    import tempfile
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl',
+                                     encoding='utf-8', delete=False) as _f:
+        _f.write("\n".join(json.dumps(r, ensure_ascii=False) for r in records))
+        _tmp_path = _f.name
+    try:
+        api.upload_file(
+            path_or_fileobj=_tmp_path,
+            path_in_repo="cr_index.jsonl",
+            repo_id=hf_repo,
+            repo_type="dataset",
+            token=hf_token,
+        )
+    finally:
+        Path(_tmp_path).unlink(missing_ok=True)

scripts/orchestrate_cr.py CHANGED Viewed

@@ -81,6 +81,173 @@ class _TeeWriter:
         self._real.flush()
 # ── Shared Steps 2, 4, 5, 6 ──────────────────────────────────────────────────
 def _run_steps_2_to_6(cr_list, ts_groups, output_dir, cr_dir, ts_dir,
@@ -182,144 +349,12 @@ def _run_steps_2_to_6(cr_list, ts_groups, output_dir, cr_dir, ts_dir,
     report = []  # (ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors)
     for (spec_number, version), uids in ts_groups.items():
-        ts_key = f'TS {spec_number} v{version}'
         spec_compact = spec_number.replace(' ', '')
         spec_dir = spec_dirs.get((spec_number, version), ts_dir / spec_compact)
         spec_dir.mkdir(parents=True, exist_ok=True)
-        new_v = derive_new_version(version)
-        stem = f'ts_{spec_compact}_v{new_v}_was_v{version}'
-        ts_applied = spec_dir / f'ts_{spec_compact}_v{version}_applied.docx'
-        ts_final   = spec_dir / f'{stem}.docx'
-        log_path   = spec_dir / f'{stem}.log'
-        errors = []
-        print(f'\n-- {ts_key} ({len(uids)} CR(s): {", ".join(uids)}) --')
-        if (spec_number, version) not in ts_paths:
-            msg = 'TS download failed — skipping'
-            print(f'  SKIP: {msg}')
-            report.append((ts_key, 0, 0, len(uids), None, log_path, [msg]))
-            continue
-        ts_in = ts_paths[(spec_number, version)]
-        log_buf = io.StringIO()
-        tee = _TeeWriter(sys.stdout, log_buf)
-        with contextlib.redirect_stdout(tee):
-            log_header = (
-                f'Pipeline Log\n'
-                f'TS:   {spec_number}  v{version} -> v{new_v}\n'
-                f'CRs:  {", ".join(uids)}\n'
-                f'Date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n'
-                f'{"=" * 60}\n'
-            )
-            print(log_header, end='')
-            combined_manifest = []
-            participating_uids = []
-            for uid in uids:
-                if uid not in cr_paths:
-                    errors.append(f'[{uid}] CR download had failed — skipped')
-                    continue
-                print(f'  Parsing {uid}... ', end='', flush=True)
-                try:
-                    changes = parse_cr(cr_paths[uid])
-                    combined_manifest.extend(changes)
-                    participating_uids.append(uid)
-                    print(f'{len(changes)} change(s)')
-                except Exception as e:
-                    errors.append(f'[{uid}] parse ERROR: {e}')
-                    print(f'ERROR: {e}')
-            if not combined_manifest:
-                print('  No changes parsed — skipping apply step.')
-                report.append((ts_key, 0, 0, len(uids), None, log_path,
-                                errors + ['No changes parsed']))
-                log_path.write_text(log_buf.getvalue(), encoding='utf-8')
-                continue
-            print(f'  Applying {len(combined_manifest)} change(s) to {ts_in.name}...')
-            try:
-                n_ok, n_skip, log_lines = apply_manifest(
-                    ts_in, combined_manifest, ts_applied, author=author, date=tc_date
-                )
-            except Exception as e:
-                errors.append(f'apply_manifest ERROR: {e}')
-                print(f'  ERROR: {e}')
-                report.append((ts_key, 0, 0, len(uids), None, log_path, errors))
-                log_path.write_text(log_buf.getvalue(), encoding='utf-8')
-                continue
-            for line in log_lines:
-                print(f'  {line}')
-            for line in log_lines:
-                if line.strip().startswith('ERROR'):
-                    errors.append(line.strip())
-            print(f'  -> Applied: {n_ok}  Skipped: {n_skip}')
-            print('  Finalising metadata...')
-            try:
-                ts_doc = docx_lib.Document(str(ts_applied))
-                rev = RevCounter(ts_doc)
-                pub_ym, pub_month_year = compute_pub_date()
-                old_v = version
-                title_text = ts_doc.paragraphs[0].text
-                date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
-                old_date_str = date_match.group(1) if date_match else ''
-                print(f'    Version:     {old_v} -> {new_v}')
-                print(f'    Publication: {pub_month_year} ({pub_ym})')
-                for uid in participating_uids:
-                    try:
-                        meta = extract_cr_metadata(str(cr_paths[uid]))
-                        ch_cells = update_change_history_table(
-                            ts_doc, meta, pub_ym, old_v, new_v, rev, author, tc_date
-                        )
-                        print(f'    [Change History] {uid}: {ch_cells}')
-                    except NoChangeHistoryTable:
-                        print(f'    [Change History] {uid}: NOT PRESENT — this document has no Change History table (History table only)')
-                    except Exception as e:
-                        errors.append(f'[{uid}] Change History ERROR: {e}')
-                        print(f'    [Change History] {uid}: ERROR — {e}')
-                try:
-                    h_cells = update_history_table(
-                        ts_doc, new_v, pub_month_year, rev, author, tc_date
-                    )
-                    print(f'    [History] {h_cells}')
-                except Exception as e:
-                    errors.append(f'History table ERROR: {e}')
-                    print(f'    [History] ERROR — {e}')
-                if old_date_str:
-                    try:
-                        update_title_para(
-                            ts_doc, old_v, new_v, old_date_str, pub_ym, rev, author, tc_date
-                        )
-                        print(f'    [Title] V{old_v} -> V{new_v}, ({old_date_str}) -> ({pub_ym})')
-                    except Exception as e:
-                        errors.append(f'Title update ERROR: {e}')
-                        print(f'    [Title] ERROR — {e}')
-                else:
-                    print(f'    [Title] SKIP — no (YYYY-MM) pattern in: {title_text!r}')
-                ts_doc.save(str(ts_final))
-                print(f'  Saved: {spec_compact}/{ts_final.name}')
-                print(f'  Log:   {spec_compact}/{log_path.name}')
-                report.append((ts_key, n_ok, n_skip, len(uids), ts_final, log_path, errors))
-            except Exception as e:
-                errors.append(f'Finalisation ERROR: {e}')
-                print(f'  Finalisation ERROR: {e}')
-                report.append((ts_key, n_ok, n_skip, len(uids), ts_applied, log_path, errors))
-        log_path.write_text(log_buf.getvalue(), encoding='utf-8')
     return report, cr_paths, ts_paths, spec_dirs
@@ -450,144 +485,13 @@ def main():
         report = []
         for (spec_number, version), uids in ts_groups.items():
-            ts_key = f'TS {spec_number} v{version}'
             spec_compact = spec_number.replace(' ', '')
             spec_dir = spec_dirs.get((spec_number, version), ts_dir / spec_compact)
             spec_dir.mkdir(parents=True, exist_ok=True)
-            new_v = derive_new_version(version)
-            stem = f'ts_{spec_compact}_v{new_v}_was_v{version}'
-            ts_applied = spec_dir / f'ts_{spec_compact}_v{version}_applied.docx'
-            ts_final   = spec_dir / f'{stem}.docx'
-            log_path   = spec_dir / f'{stem}.log'
-            errors = []
-            print(f'\n-- {ts_key} ({len(uids)} CR(s): {", ".join(uids)}) --')
-            if (spec_number, version) not in ts_paths:
-                msg = 'TS DOCX not on disk — skipping'
-                print(f'  SKIP: {msg}')
-                report.append((ts_key, 0, 0, len(uids), None, log_path, [msg]))
-                continue
-            ts_in = ts_paths[(spec_number, version)]
-            log_buf = io.StringIO()
-            tee = _TeeWriter(sys.stdout, log_buf)
-            with contextlib.redirect_stdout(tee):
-                log_header = (
-                    f'Pipeline Log (retry)\n'
-                    f'TS:   {spec_number}  v{version} -> v{new_v}\n'
-                    f'CRs:  {", ".join(uids)}\n'
-                    f'Date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n'
-                    f'{"=" * 60}\n'
-                )
-                print(log_header, end='')
-                combined_manifest = []
-                participating_uids = []
-                for uid in uids:
-                    if uid not in cr_paths:
-                        errors.append(f'[{uid}] CR DOCX not found — skipped')
-                        continue
-                    print(f'  Parsing {uid}... ', end='', flush=True)
-                    try:
-                        changes = parse_cr(cr_paths[uid])
-                        combined_manifest.extend(changes)
-                        participating_uids.append(uid)
-                        print(f'{len(changes)} change(s)')
-                    except Exception as e:
-                        errors.append(f'[{uid}] parse ERROR: {e}')
-                        print(f'ERROR: {e}')
-                if not combined_manifest:
-                    print('  No changes parsed — skipping apply step.')
-                    report.append((ts_key, 0, 0, len(uids), None, log_path,
-                                    errors + ['No changes parsed']))
-                    log_path.write_text(log_buf.getvalue(), encoding='utf-8')
-                    continue
-                print(f'  Applying {len(combined_manifest)} change(s) to {ts_in.name}...')
-                try:
-                    n_ok, n_skip, log_lines = apply_manifest(
-                        ts_in, combined_manifest, ts_applied, author=author, date=tc_date
-                    )
-                except Exception as e:
-                    errors.append(f'apply_manifest ERROR: {e}')
-                    print(f'  ERROR: {e}')
-                    report.append((ts_key, 0, 0, len(uids), None, log_path, errors))
-                    log_path.write_text(log_buf.getvalue(), encoding='utf-8')
-                    continue
-                for line in log_lines:
-                    print(f'  {line}')
-                for line in log_lines:
-                    if line.strip().startswith('ERROR'):
-                        errors.append(line.strip())
-                print(f'  -> Applied: {n_ok}  Skipped: {n_skip}')
-                print('  Finalising metadata...')
-                try:
-                    ts_doc = docx_lib.Document(str(ts_applied))
-                    rev = RevCounter(ts_doc)
-                    pub_ym, pub_month_year = compute_pub_date()
-                    old_v = version
-                    title_text = ts_doc.paragraphs[0].text
-                    date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
-                    old_date_str = date_match.group(1) if date_match else ''
-                    print(f'    Version:     {old_v} -> {new_v}')
-                    print(f'    Publication: {pub_month_year} ({pub_ym})')
-                    for uid in participating_uids:
-                        try:
-                            meta = extract_cr_metadata(str(cr_paths[uid]))
-                            ch_cells = update_change_history_table(
-                                ts_doc, meta, pub_ym, old_v, new_v, rev, author, tc_date
-                            )
-                            print(f'    [Change History] {uid}: {ch_cells}')
-                        except NoChangeHistoryTable:
-                            print(f'    [Change History] {uid}: NOT PRESENT — this document has no Change History table (History table only)')
-                        except Exception as e:
-                            errors.append(f'[{uid}] Change History ERROR: {e}')
-                            print(f'    [Change History] {uid}: ERROR — {e}')
-                    try:
-                        h_cells = update_history_table(
-                            ts_doc, new_v, pub_month_year, rev, author, tc_date
-                        )
-                        print(f'    [History] {h_cells}')
-                    except Exception as e:
-                        errors.append(f'History table ERROR: {e}')
-                        print(f'    [History] ERROR — {e}')
-                    if old_date_str:
-                        try:
-                            update_title_para(
-                                ts_doc, old_v, new_v, old_date_str, pub_ym, rev, author, tc_date
-                            )
-                            print(f'    [Title] V{old_v} -> V{new_v}, ({old_date_str}) -> ({pub_ym})')
-                        except Exception as e:
-                            errors.append(f'Title update ERROR: {e}')
-                            print(f'    [Title] ERROR — {e}')
-                    else:
-                        print(f'    [Title] SKIP — no (YYYY-MM) pattern in: {title_text!r}')
-                    ts_doc.save(str(ts_final))
-                    print(f'  Saved: {spec_compact}/{ts_final.name}')
-                    print(f'  Log:   {spec_compact}/{log_path.name}')
-                    report.append((ts_key, n_ok, n_skip, len(uids), ts_final, log_path, errors))
-                except Exception as e:
-                    errors.append(f'Finalisation ERROR: {e}')
-                    print(f'  Finalisation ERROR: {e}')
-                    report.append((ts_key, n_ok, n_skip, len(uids), ts_applied, log_path, errors))
-            log_path.write_text(log_buf.getvalue(), encoding='utf-8')
         # Update failed_ts.json — remove entries that are now resolved
         still_failed = [
@@ -601,11 +505,7 @@ def main():
         n_partial  = sum(1 for r in report if r[4] is not None and r[6])
         n_failed   = sum(1 for r in report if r[4] is None)
         print(f'TSs processed:  {n_success} fully OK, {n_partial} with warnings, {n_failed} skipped/failed')
-        for ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors in report:
-            status_tag = 'OK' if out_path and not errors else ('WARN' if out_path else 'SKIP')
-            print(f'  [{status_tag}] {ts_key}')
-            for err in errors:
-                print(f'         ! {err}')
         return
     # ── TS mode — load HF index, skip Steps 1 & 3 ────────────────────────────
@@ -656,12 +556,7 @@ def main():
         # Copy the CRs actually applied into the run output dir so the ZIP
         # contains exactly the CRs used for this TS (only needed when using
         # a shared CR cache that lives outside output_dir).
-        _run_cr_dir = output_dir / 'CRs'
-        if cr_dir.resolve() != _run_cr_dir.resolve():
-            _run_cr_dir.mkdir(parents=True, exist_ok=True)
-            for _p in cr_paths.values():
-                if _p.exists():
-                    shutil.copy2(_p, _run_cr_dir / _p.name)
         _section('Final Report (TS mode)')
         n_success = sum(1 for r in report if r[4] is not None and not r[6])
@@ -673,16 +568,7 @@ def main():
         print(f'TSs updated:  {n_success} fully OK, {n_partial} with warnings, {n_failed} failed')
         print()
-        for ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors in report:
-            status = 'OK' if out_path and not errors else ('WARN' if out_path else 'FAIL')
-            print(f'  [{status}] {ts_key}')
-            print(f'         CRs: {n_crs}  |  Body changes applied: {n_ok}  |  Skipped: {n_skip}')
-            if out_path:
-                print(f'         Output: {out_path.parent.name}/{out_path.name}')
-            if log_path and log_path.exists():
-                print(f'         Log:    {log_path.parent.name}/{log_path.name}')
-            for err in errors:
-                print(f'         ! {err}')
         print()
         print(f'Output directory: {output_dir}/')
@@ -716,12 +602,7 @@ def main():
     # Copy the CRs actually applied into the run output dir so the ZIP
     # contains exactly the CRs used for this run (only needed when using
     # a shared CR cache that lives outside output_dir).
-    _run_cr_dir = output_dir / 'CRs'
-    if cr_dir.resolve() != _run_cr_dir.resolve():
-        _run_cr_dir.mkdir(parents=True, exist_ok=True)
-        for _p in cr_paths.values():
-            if _p.exists():
-                shutil.copy2(_p, _run_cr_dir / _p.name)
     # ── Final Report ──────────────────────────────────────────────────────────
     _section('Final Report')
@@ -735,21 +616,7 @@ def main():
     print(f'TSs updated:  {n_success} fully OK, {n_partial} with warnings, {n_failed} failed')
     print()
-    for ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors in report:
-        if out_path and not errors:
-            status = 'OK'
-        elif out_path:
-            status = 'WARN'
-        else:
-            status = 'FAIL'
-        print(f'  [{status}] {ts_key}')
-        print(f'         CRs: {n_crs}  |  Body changes applied: {n_ok}  |  Skipped: {n_skip}')
-        if out_path:
-            print(f'         Output: {out_path.parent.name}/{out_path.name}')
-        if log_path and log_path.exists():
-            print(f'         Log:    {log_path.parent.name}/{log_path.name}')
-        for err in errors:
-            print(f'         ! {err}')
     print()
     print(f'Output directory: {output_dir}/')

         self._real.flush()
+# ── Small report / cache helpers ─────────────────────────────────────────────
+def _print_report(report, *, detailed=True):
+    """Print per-TS result lines from a report list."""
+    for ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors in report:
+        status = 'OK' if out_path and not errors else ('WARN' if out_path else 'FAIL')
+        print(f'  [{status}] {ts_key}')
+        if detailed:
+            print(f'         CRs: {n_crs}  |  Body changes applied: {n_ok}  |  Skipped: {n_skip}')
+            if out_path:
+                print(f'         Output: {out_path.parent.name}/{out_path.name}')
+            if log_path and log_path.exists():
+                print(f'         Log:    {log_path.parent.name}/{log_path.name}')
+        for err in errors:
+            print(f'         ! {err}')
+def _copy_cr_cache_if_needed(cr_paths, cr_dir, output_dir):
+    """Copy downloaded CRs into output_dir/CRs when a shared cache is used."""
+    run_cr_dir = output_dir / 'CRs'
+    if cr_dir.resolve() != run_cr_dir.resolve():
+        run_cr_dir.mkdir(parents=True, exist_ok=True)
+        for p in cr_paths.values():
+            if p.exists():
+                shutil.copy2(p, run_cr_dir / p.name)
+# ── Per-TS-group apply helper ─────────────────────────────────────────────────
+def _apply_ts_group(spec_number, version, uids, ts_paths, cr_paths, spec_dir,
+                    author, tc_date, log_label='Pipeline Log'):
+    """Parse, apply, and finalise one TS group. Returns one report tuple."""
+    ts_key = f'TS {spec_number} v{version}'
+    spec_compact = spec_number.replace(' ', '')
+    new_v = derive_new_version(version)
+    stem = f'ts_{spec_compact}_v{new_v}_was_v{version}'
+    ts_applied = spec_dir / f'ts_{spec_compact}_v{version}_applied.docx'
+    ts_final   = spec_dir / f'{stem}.docx'
+    log_path   = spec_dir / f'{stem}.log'
+    errors = []
+    print(f'\n-- {ts_key} ({len(uids)} CR(s): {", ".join(uids)}) --')
+    if (spec_number, version) not in ts_paths:
+        msg = 'TS download failed — skipping'
+        print(f'  SKIP: {msg}')
+        return (ts_key, 0, 0, len(uids), None, log_path, [msg])
+    ts_in = ts_paths[(spec_number, version)]
+    log_buf = io.StringIO()
+    tee = _TeeWriter(sys.stdout, log_buf)
+    with contextlib.redirect_stdout(tee):
+        log_header = (
+            f'{log_label}\n'
+            f'TS:   {spec_number}  v{version} -> v{new_v}\n'
+            f'CRs:  {", ".join(uids)}\n'
+            f'Date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n'
+            f'{"=" * 60}\n'
+        )
+        print(log_header, end='')
+        combined_manifest = []
+        participating_uids = []
+        for uid in uids:
+            if uid not in cr_paths:
+                errors.append(f'[{uid}] CR download had failed — skipped')
+                continue
+            print(f'  Parsing {uid}... ', end='', flush=True)
+            try:
+                changes = parse_cr(cr_paths[uid])
+                combined_manifest.extend(changes)
+                participating_uids.append(uid)
+                print(f'{len(changes)} change(s)')
+            except Exception as e:
+                errors.append(f'[{uid}] parse ERROR: {e}')
+                print(f'ERROR: {e}')
+        if not combined_manifest:
+            print('  No changes parsed — skipping apply step.')
+            log_path.write_text(log_buf.getvalue(), encoding='utf-8')
+            return (ts_key, 0, 0, len(uids), None, log_path,
+                    errors + ['No changes parsed'])
+        print(f'  Applying {len(combined_manifest)} change(s) to {ts_in.name}...')
+        try:
+            n_ok, n_skip, log_lines = apply_manifest(
+                ts_in, combined_manifest, ts_applied, author=author, date=tc_date
+            )
+        except Exception as e:
+            errors.append(f'apply_manifest ERROR: {e}')
+            print(f'  ERROR: {e}')
+            log_path.write_text(log_buf.getvalue(), encoding='utf-8')
+            return (ts_key, 0, 0, len(uids), None, log_path, errors)
+        for line in log_lines:
+            print(f'  {line}')
+        for line in log_lines:
+            if line.strip().startswith('ERROR'):
+                errors.append(line.strip())
+        print(f'  -> Applied: {n_ok}  Skipped: {n_skip}')
+        print('  Finalising metadata...')
+        ts_final_or_applied = ts_applied  # fallback if finalise raises
+        try:
+            ts_doc = docx_lib.Document(str(ts_applied))
+            rev = RevCounter(ts_doc)
+            pub_ym, pub_month_year = compute_pub_date()
+            old_v = version
+            title_text = ts_doc.paragraphs[0].text
+            date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
+            old_date_str = date_match.group(1) if date_match else ''
+            print(f'    Version:     {old_v} -> {new_v}')
+            print(f'    Publication: {pub_month_year} ({pub_ym})')
+            for uid in participating_uids:
+                try:
+                    meta = extract_cr_metadata(str(cr_paths[uid]))
+                    ch_cells = update_change_history_table(
+                        ts_doc, meta, pub_ym, old_v, new_v, rev, author, tc_date
+                    )
+                    print(f'    [Change History] {uid}: {ch_cells}')
+                except NoChangeHistoryTable:
+                    print(f'    [Change History] {uid}: NOT PRESENT — this document has no Change History table (History table only)')
+                except Exception as e:
+                    errors.append(f'[{uid}] Change History ERROR: {e}')
+                    print(f'    [Change History] {uid}: ERROR — {e}')
+            try:
+                h_cells = update_history_table(
+                    ts_doc, new_v, pub_month_year, rev, author, tc_date
+                )
+                print(f'    [History] {h_cells}')
+            except Exception as e:
+                errors.append(f'History table ERROR: {e}')
+                print(f'    [History] ERROR — {e}')
+            if old_date_str:
+                try:
+                    update_title_para(
+                        ts_doc, old_v, new_v, old_date_str, pub_ym, rev, author, tc_date
+                    )
+                    print(f'    [Title] V{old_v} -> V{new_v}, ({old_date_str}) -> ({pub_ym})')
+                except Exception as e:
+                    errors.append(f'Title update ERROR: {e}')
+                    print(f'    [Title] ERROR — {e}')
+            else:
+                print(f'    [Title] SKIP — no (YYYY-MM) pattern in: {title_text!r}')
+            ts_doc.save(str(ts_final))
+            print(f'  Saved: {spec_compact}/{ts_final.name}')
+            print(f'  Log:   {spec_compact}/{log_path.name}')
+            ts_final_or_applied = ts_final
+        except Exception as e:
+            errors.append(f'Finalisation ERROR: {e}')
+            print(f'  Finalisation ERROR: {e}')
+    log_path.write_text(log_buf.getvalue(), encoding='utf-8')
+    return (ts_key, n_ok, n_skip, len(uids), ts_final_or_applied, log_path, errors)
 # ── Shared Steps 2, 4, 5, 6 ──────────────────────────────────────────────────
 def _run_steps_2_to_6(cr_list, ts_groups, output_dir, cr_dir, ts_dir,
     report = []  # (ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors)
     for (spec_number, version), uids in ts_groups.items():
         spec_compact = spec_number.replace(' ', '')
         spec_dir = spec_dirs.get((spec_number, version), ts_dir / spec_compact)
         spec_dir.mkdir(parents=True, exist_ok=True)
+        report.append(_apply_ts_group(
+            spec_number, version, uids, ts_paths, cr_paths, spec_dir, author, tc_date
+        ))
     return report, cr_paths, ts_paths, spec_dirs
         report = []
         for (spec_number, version), uids in ts_groups.items():
             spec_compact = spec_number.replace(' ', '')
             spec_dir = spec_dirs.get((spec_number, version), ts_dir / spec_compact)
             spec_dir.mkdir(parents=True, exist_ok=True)
+            report.append(_apply_ts_group(
+                spec_number, version, uids, ts_paths, cr_paths, spec_dir, author, tc_date,
+                log_label='Pipeline Log (retry)'
+            ))
         # Update failed_ts.json — remove entries that are now resolved
         still_failed = [
         n_partial  = sum(1 for r in report if r[4] is not None and r[6])
         n_failed   = sum(1 for r in report if r[4] is None)
         print(f'TSs processed:  {n_success} fully OK, {n_partial} with warnings, {n_failed} skipped/failed')
+        _print_report(report, detailed=False)
         return
     # ── TS mode — load HF index, skip Steps 1 & 3 ────────────────────────────
         # Copy the CRs actually applied into the run output dir so the ZIP
         # contains exactly the CRs used for this TS (only needed when using
         # a shared CR cache that lives outside output_dir).
+        _copy_cr_cache_if_needed(cr_paths, cr_dir, output_dir)
         _section('Final Report (TS mode)')
         n_success = sum(1 for r in report if r[4] is not None and not r[6])
         print(f'TSs updated:  {n_success} fully OK, {n_partial} with warnings, {n_failed} failed')
         print()
+        _print_report(report)
         print()
         print(f'Output directory: {output_dir}/')
     # Copy the CRs actually applied into the run output dir so the ZIP
     # contains exactly the CRs used for this run (only needed when using
     # a shared CR cache that lives outside output_dir).
+    _copy_cr_cache_if_needed(cr_paths, cr_dir, output_dir)
     # ── Final Report ──────────────────────────────────────────────────────────
     _section('Final Report')
     print(f'TSs updated:  {n_success} fully OK, {n_partial} with warnings, {n_failed} failed')
     print()
+    _print_report(report)
     print()
     print(f'Output directory: {output_dir}/')

scripts/ts_applicator.py CHANGED Viewed

@@ -21,6 +21,10 @@ from docx.oxml import OxmlElement
 from docx.oxml.ns import qn
 sys.path.insert(0, str(Path(__file__).parent))
 from docx_helpers import (
     RevCounter,
     tracked_modify_para,
@@ -32,24 +36,29 @@ from docx_helpers import (
 # ── Text normalisation ────────────────────────────────────────────────────────
 def _norm(text):
     """Normalise common Unicode invisible/whitespace/punctuation variants for comparison."""
-    return (text
-            .replace('\xa0',   ' ')   # non-breaking space
-            .replace('\u202f', ' ')   # narrow no-break space
-            .replace('\u2007', ' ')   # figure space
-            .replace('\u2060', '')    # word joiner (invisible)
-            .replace('\u200b', '')    # zero-width space
-            .replace('\u00ad', '')    # soft hyphen (invisible)
-            .replace('\u2011', '-')   # non-breaking hyphen
-            .replace('\u2013', '-')   # en dash
-            .replace('\u2014', '-')   # em dash
-            .replace('\u2212', '-')   # minus sign
-            .replace('\u2018', "'")   # left single quote
-            .replace('\u2019', "'")   # right single quote
-            .replace('\u201c', '"')   # left double quote
-            .replace('\u201d', '"')   # right double quote
-            .strip())
 def _norm_ws(text):
@@ -70,22 +79,9 @@ def _norm_ws(text):
     Removing all whitespace from both sides before comparing solves this.
     Used as a third-level fallback (confidence 0.8) after exact and NBSP-norm.
     """
-    base = (text
-            .replace('\xa0',   '')
-            .replace('\u202f', '')
-            .replace('\u2007', '')
-            .replace('\u2060', '')
-            .replace('\u200b', '')
-            .replace('\u00ad', '')
-            .replace('\u2011', '-')
-            .replace('\u2013', '-')
-            .replace('\u2014', '-')
-            .replace('\u2212', '-')
-            .replace('\u2018', "'")
-            .replace('\u2019', "'")
-            .replace('\u201c', '"')
-            .replace('\u201d', '"'))
-    return re.sub(r'\s+', '', base)
 def _norm_alnum(text):
@@ -225,14 +221,14 @@ def _find_table(doc, header_key):
     for tbl in doc.tables:
         if not tbl.rows:
             continue
-        first_row_texts = [_norm(c.text) for c in tbl.rows[0].cells]
-        # Match by prefix (header_key may have fewer columns)
-        match = all(
-            i < len(first_row_texts) and norm_key[i] in first_row_texts[i]
-            for i in range(len(norm_key))
-        )
-        if match:
-            return tbl, 1.0
     return None, 0.0
@@ -453,6 +449,20 @@ def _apply_section_replace(doc, change, rev, author, date, log):
                 break
             else:
                 break
     # ── Clone and remap IDs on the CR elements ─────────────────────────────────
     cloned = []
@@ -522,7 +532,10 @@ def _apply_text_replace(doc, change, rev, author, date, log):
             for para in cell.paragraphs:
                 if old in para.text:
                     tracked_modify_para(para, old, new, rev, author, date)
-                    log.append(f"  OK  text_replace (table_cell row={row_idx} col={col_idx}): {old!r} → {new!r}")
                     return True
             log.append(f"  ERROR text_replace: old text {old!r} not in cell (row={row_idx} col={col_idx})")
             return False
@@ -546,7 +559,11 @@ def _apply_text_replace(doc, change, rev, author, date, log):
                             tracked_modify_para(para, old, new, rev, author, date)
                             log.append(f"  OK  text_replace (table_cell scan row={r_idx} col={col_idx}): {old!r} → {new!r}")
                             return True
-            # Final fallback: scan ALL columns of ALL tables
             _all_start = tbl_by_section if tbl_by_section is not None else tbl
             for search_tbl in [_all_start] + [t for t in doc.tables if t is not _all_start]:
                 for r_idx, row in enumerate(search_tbl.rows):
@@ -554,7 +571,9 @@ def _apply_text_replace(doc, change, rev, author, date, log):
                         for para in cell.paragraphs:
                             if old in para.text:
                                 tracked_modify_para(para, old, new, rev, author, date)
-                                log.append(f"  OK  text_replace (table_cell any_col row={r_idx} col={c_idx}): {old!r} → {new!r}")
                                 return True
             log.append(f"  ERROR text_replace: old text {old!r} not found in any table column")
             return False
@@ -606,6 +625,7 @@ def _apply_row_insert(doc, change, rev, author, date, log, last_inserted=None):
     tbl_by_section, _ = _find_table_by_section(doc, section_heading)
     if tbl_by_section is not None:
         tbl = tbl_by_section
     else:
         tbl, t_conf = _find_table(doc, loc['table_header'])
         if tbl is None:
@@ -636,7 +656,9 @@ def _apply_row_insert(doc, change, rev, author, date, log, last_inserted=None):
             last_inserted[key] = new_tr
     desc = cells_data[1]['text'] if len(cells_data) > 1 else '?'
-    log.append(f"  OK  row_insert after row[{row_idx}] ({after_anchor!r}): {desc!r}")
     return True

 from docx.oxml.ns import qn
 sys.path.insert(0, str(Path(__file__).parent))
+_MIN_LEN_ALLCOL_FALLBACK = 8  # old text shorter than this is too ambiguous for any-column search
+_WARN_CONF = 0.8   # confidence below this emits WARN instead of OK
 from docx_helpers import (
     RevCounter,
     tracked_modify_para,
 # ── Text normalisation ────────────────────────────────────────────────────────
+_UNICODE_REPLACEMENTS = (
+    ('\xa0',   ' '),  # non-breaking space
+    ('\u202f', ' '),  # narrow no-break space
+    ('\u2007', ' '),  # figure space
+    ('\u2060', ''),   # word joiner (invisible)
+    ('\u200b', ''),   # zero-width space
+    ('\u00ad', ''),   # soft hyphen (invisible)
+    ('\u2011', '-'),  # non-breaking hyphen
+    ('\u2013', '-'),  # en dash
+    ('\u2014', '-'),  # em dash
+    ('\u2212', '-'),  # minus sign
+    ('\u2018', "'"),  # left single quote
+    ('\u2019', "'"),  # right single quote
+    ('\u201c', '"'),  # left double quote
+    ('\u201d', '"'),  # right double quote
+)
 def _norm(text):
     """Normalise common Unicode invisible/whitespace/punctuation variants for comparison."""
+    for old, new in _UNICODE_REPLACEMENTS:
+        text = text.replace(old, new)
+    return text.strip()
 def _norm_ws(text):
     Removing all whitespace from both sides before comparing solves this.
     Used as a third-level fallback (confidence 0.8) after exact and NBSP-norm.
     """
+    for old, new in _UNICODE_REPLACEMENTS:
+        text = text.replace(old, new)
+    return re.sub(r'\s+', '', text)
 def _norm_alnum(text):
     for tbl in doc.tables:
         if not tbl.rows:
             continue
+        for row in tbl.rows[:3]:   # check first 3 rows — header may not be row 0
+            row_texts = [_norm(c.text) for c in row.cells]
+            match = all(
+                i < len(row_texts) and norm_key[i] in row_texts[i]
+                for i in range(len(norm_key))
+            )
+            if match:
+                return tbl, 1.0
     return None, 0.0
                 break
             else:
                 break
+    # Validate the candidate table matches what the CR says should be deleted
+    if ts_tbl_elem is not None and elements_xml:
+        cr_tbl_xmls = [x for x in elements_xml if '<w:tbl' in x]
+        if cr_tbl_xmls:
+            from lxml import etree as _etree
+            cr_tbl_el = _etree.fromstring(cr_tbl_xmls[0].encode())
+            cr_hdr = ''.join(t.text or '' for t in
+                             cr_tbl_el.findall('.//' + qn('w:t'))[:10]).lower()
+            ts_hdr = ''.join(t.text or '' for t in
+                             ts_tbl_elem.findall('.//' + qn('w:t'))[:10]).lower()
+            if cr_hdr and cr_hdr not in ts_hdr and ts_hdr not in cr_hdr:
+                log.append('  WARN section_replace: candidate table header mismatch'
+                           ' — skipping table removal')
+                ts_tbl_elem = None
     # ── Clone and remap IDs on the CR elements ─────────────────────────────────
     cloned = []
             for para in cell.paragraphs:
                 if old in para.text:
                     tracked_modify_para(para, old, new, rev, author, date)
+                    _pfx = 'WARN' if min(t_conf, r_conf) < _WARN_CONF else 'OK  '
+                    log.append(f"  {_pfx} text_replace (table_cell"
+                               f" t_conf={t_conf:.1f} r_conf={r_conf:.1f}"
+                               f" row={row_idx} col={col_idx}): {old!r} → {new!r}")
                     return True
             log.append(f"  ERROR text_replace: old text {old!r} not in cell (row={row_idx} col={col_idx})")
             return False
                             tracked_modify_para(para, old, new, rev, author, date)
                             log.append(f"  OK  text_replace (table_cell scan row={r_idx} col={col_idx}): {old!r} → {new!r}")
                             return True
+            # Final fallback: scan ALL columns of ALL tables (guarded by min length)
+            if len(old) < _MIN_LEN_ALLCOL_FALLBACK:
+                log.append(f"  ERROR text_replace: {old!r} too short for all-column fallback"
+                           f" (ambiguous — skipped)")
+                return False
             _all_start = tbl_by_section if tbl_by_section is not None else tbl
             for search_tbl in [_all_start] + [t for t in doc.tables if t is not _all_start]:
                 for r_idx, row in enumerate(search_tbl.rows):
                         for para in cell.paragraphs:
                             if old in para.text:
                                 tracked_modify_para(para, old, new, rev, author, date)
+                                log.append(f"  WARN text_replace (table_cell any_col"
+                                           f" row={r_idx} col={c_idx} — low confidence):"
+                                           f" {old!r} → {new!r}")
                                 return True
             log.append(f"  ERROR text_replace: old text {old!r} not found in any table column")
             return False
     tbl_by_section, _ = _find_table_by_section(doc, section_heading)
     if tbl_by_section is not None:
         tbl = tbl_by_section
+        t_conf = 1.0
     else:
         tbl, t_conf = _find_table(doc, loc['table_header'])
         if tbl is None:
             last_inserted[key] = new_tr
     desc = cells_data[1]['text'] if len(cells_data) > 1 else '?'
+    _pfx = 'WARN' if min(t_conf, r_conf) < _WARN_CONF else 'OK  '
+    log.append(f"  {_pfx} row_insert (t_conf={t_conf:.1f} r_conf={r_conf:.1f})"
+               f" after row[{row_idx}] ({after_anchor!r}): {desc!r}")
     return True