Spaces:
Sleeping
Sleeping
reduce codebase size, prevent errors and unknowns and configure new ETSI download process
Browse files- app.py +39 -96
- scripts/cr_parser.py +20 -1
- scripts/docx_helpers.py +5 -2
- scripts/etsi_client.py +5 -4
- scripts/fetch_crs.py +80 -122
- scripts/finalize_ts.py +16 -4
- scripts/hf_cr_index.py +15 -12
- scripts/orchestrate_cr.py +179 -312
- scripts/ts_applicator.py +66 -44
app.py
CHANGED
|
@@ -264,6 +264,22 @@ def load_hf_index_cached(hf_token: str, hf_repo: str) -> list[dict]:
|
|
| 264 |
return st.session_state[key]
|
| 265 |
|
| 266 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
# ── Page config ───────────────────────────────────────────────────────────────
|
| 268 |
st.set_page_config(
|
| 269 |
page_title="CR Application Tool",
|
|
@@ -454,29 +470,11 @@ elif status == "upload":
|
|
| 454 |
env["EOL_PASSWORD"] = st.session_state.eol_password
|
| 455 |
# HF_TOKEN is already in env via os.environ
|
| 456 |
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
env=env,
|
| 463 |
-
)
|
| 464 |
-
log_file.close()
|
| 465 |
-
|
| 466 |
-
threading.Thread(
|
| 467 |
-
target=_run_and_save_rc,
|
| 468 |
-
args=(proc, rc_path),
|
| 469 |
-
daemon=True,
|
| 470 |
-
).start()
|
| 471 |
-
st.session_state.proc = proc
|
| 472 |
-
|
| 473 |
-
state["status"] = "indexing"
|
| 474 |
-
state["pid"] = proc.pid
|
| 475 |
-
state["index_log"] = index_log
|
| 476 |
-
state["output_dir"] = "" # no pipeline output yet
|
| 477 |
-
state["started_at"] = datetime.now().isoformat()
|
| 478 |
-
save_state(sid, state)
|
| 479 |
-
st.rerun()
|
| 480 |
|
| 481 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 482 |
# INDEXING (build_cr_index.py running)
|
|
@@ -588,31 +586,13 @@ elif status == "ts_select":
|
|
| 588 |
env["EOL_PASSWORD"] = st.session_state.eol_password
|
| 589 |
# HF_TOKEN already in env via os.environ
|
| 590 |
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
)
|
| 598 |
-
log_file.close()
|
| 599 |
-
|
| 600 |
-
threading.Thread(
|
| 601 |
-
target=_run_and_save_rc,
|
| 602 |
-
args=(proc, rc_path),
|
| 603 |
-
daemon=True,
|
| 604 |
-
).start()
|
| 605 |
-
st.session_state.proc = proc
|
| 606 |
-
|
| 607 |
-
state["ts_id"] = selected_spec
|
| 608 |
-
state["status"] = "running"
|
| 609 |
-
state["pid"] = proc.pid
|
| 610 |
-
state["output_dir"] = str(output_dir)
|
| 611 |
-
state["log_path"] = str(log_path)
|
| 612 |
-
state["run_log_paths"] = [str(log_path)]
|
| 613 |
-
state["started_at"] = datetime.now().isoformat()
|
| 614 |
-
save_state(sid, state)
|
| 615 |
-
st.rerun()
|
| 616 |
|
| 617 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 618 |
# PREVIEW
|
|
@@ -671,32 +651,12 @@ elif status == "preview":
|
|
| 671 |
env["EOL_USER"] = st.session_state.eol_user
|
| 672 |
env["EOL_PASSWORD"] = st.session_state.eol_password
|
| 673 |
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
)
|
| 681 |
-
log_file.close()
|
| 682 |
-
|
| 683 |
-
# Background thread writes returncode file when process finishes
|
| 684 |
-
threading.Thread(
|
| 685 |
-
target=_run_and_save_rc,
|
| 686 |
-
args=(proc, rc_path),
|
| 687 |
-
daemon=True,
|
| 688 |
-
).start()
|
| 689 |
-
|
| 690 |
-
st.session_state.proc = proc
|
| 691 |
-
|
| 692 |
-
state["status"] = "running"
|
| 693 |
-
state["pid"] = proc.pid
|
| 694 |
-
state["output_dir"] = str(output_dir)
|
| 695 |
-
state["log_path"] = str(log_path)
|
| 696 |
-
state["run_log_paths"] = [str(log_path)]
|
| 697 |
-
state["started_at"] = datetime.now().isoformat()
|
| 698 |
-
save_state(sid, state)
|
| 699 |
-
st.rerun()
|
| 700 |
|
| 701 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 702 |
# RUNNING
|
|
@@ -873,8 +833,6 @@ elif status in ("done", "error"):
|
|
| 873 |
|
| 874 |
if st.button("▶ Apply CRs to recovered TSs", type="primary"):
|
| 875 |
retry_log = str(session_dir(sid) / f"pipeline_{int(time.time())}_retry.log")
|
| 876 |
-
_rc_path(sid).unlink(missing_ok=True) # clear old returncode
|
| 877 |
-
|
| 878 |
cmd = [
|
| 879 |
sys.executable,
|
| 880 |
str(SCRIPTS_DIR / "orchestrate_cr.py"),
|
|
@@ -885,26 +843,11 @@ elif status in ("done", "error"):
|
|
| 885 |
env["EOL_USER"] = st.session_state.eol_user
|
| 886 |
env["EOL_PASSWORD"] = st.session_state.eol_password
|
| 887 |
|
| 888 |
-
|
| 889 |
-
|
| 890 |
-
|
| 891 |
-
|
| 892 |
-
|
| 893 |
-
|
| 894 |
-
threading.Thread(
|
| 895 |
-
target=_run_and_save_rc,
|
| 896 |
-
args=(proc, _rc_path(sid)),
|
| 897 |
-
daemon=True,
|
| 898 |
-
).start()
|
| 899 |
-
st.session_state.proc = proc
|
| 900 |
-
|
| 901 |
-
state["status"] = "running"
|
| 902 |
-
state["pid"] = proc.pid
|
| 903 |
-
state["log_path"] = retry_log
|
| 904 |
-
state["run_log_paths"] = state.get("run_log_paths", []) + [retry_log]
|
| 905 |
-
state["started_at"] = datetime.now().isoformat()
|
| 906 |
-
save_state(sid, state)
|
| 907 |
-
st.rerun()
|
| 908 |
else:
|
| 909 |
st.warning("No TSs available yet — retry download or upload DOCX files above.")
|
| 910 |
|
|
|
|
| 264 |
return st.session_state[key]
|
| 265 |
|
| 266 |
|
| 267 |
+
def _launch_proc(cmd, env, log_path, sid, state, extra_state: dict):
|
| 268 |
+
"""Open log_path, Popen cmd, start rc-writer thread, update state, rerun."""
|
| 269 |
+
rc_path = _rc_path(sid)
|
| 270 |
+
rc_path.unlink(missing_ok=True)
|
| 271 |
+
log_file = open(str(log_path), "w")
|
| 272 |
+
proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT, env=env)
|
| 273 |
+
log_file.close()
|
| 274 |
+
threading.Thread(target=_run_and_save_rc, args=(proc, rc_path), daemon=True).start()
|
| 275 |
+
st.session_state.proc = proc
|
| 276 |
+
state.update(extra_state)
|
| 277 |
+
state["pid"] = proc.pid
|
| 278 |
+
state["started_at"] = datetime.now().isoformat()
|
| 279 |
+
save_state(sid, state)
|
| 280 |
+
st.rerun()
|
| 281 |
+
|
| 282 |
+
|
| 283 |
# ── Page config ───────────────────────────────────────────────────────────────
|
| 284 |
st.set_page_config(
|
| 285 |
page_title="CR Application Tool",
|
|
|
|
| 470 |
env["EOL_PASSWORD"] = st.session_state.eol_password
|
| 471 |
# HF_TOKEN is already in env via os.environ
|
| 472 |
|
| 473 |
+
_launch_proc(cmd, env, index_log, sid, state, {
|
| 474 |
+
"status": "indexing",
|
| 475 |
+
"index_log": index_log,
|
| 476 |
+
"output_dir": "", # no pipeline output yet
|
| 477 |
+
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 478 |
|
| 479 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 480 |
# INDEXING (build_cr_index.py running)
|
|
|
|
| 586 |
env["EOL_PASSWORD"] = st.session_state.eol_password
|
| 587 |
# HF_TOKEN already in env via os.environ
|
| 588 |
|
| 589 |
+
_launch_proc(cmd, env, log_path, sid, state, {
|
| 590 |
+
"ts_id": selected_spec,
|
| 591 |
+
"status": "running",
|
| 592 |
+
"output_dir": str(output_dir),
|
| 593 |
+
"log_path": str(log_path),
|
| 594 |
+
"run_log_paths": [str(log_path)],
|
| 595 |
+
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 596 |
|
| 597 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 598 |
# PREVIEW
|
|
|
|
| 651 |
env["EOL_USER"] = st.session_state.eol_user
|
| 652 |
env["EOL_PASSWORD"] = st.session_state.eol_password
|
| 653 |
|
| 654 |
+
_launch_proc(cmd, env, log_path, sid, state, {
|
| 655 |
+
"status": "running",
|
| 656 |
+
"output_dir": str(output_dir),
|
| 657 |
+
"log_path": str(log_path),
|
| 658 |
+
"run_log_paths": [str(log_path)],
|
| 659 |
+
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 660 |
|
| 661 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 662 |
# RUNNING
|
|
|
|
| 833 |
|
| 834 |
if st.button("▶ Apply CRs to recovered TSs", type="primary"):
|
| 835 |
retry_log = str(session_dir(sid) / f"pipeline_{int(time.time())}_retry.log")
|
|
|
|
|
|
|
| 836 |
cmd = [
|
| 837 |
sys.executable,
|
| 838 |
str(SCRIPTS_DIR / "orchestrate_cr.py"),
|
|
|
|
| 843 |
env["EOL_USER"] = st.session_state.eol_user
|
| 844 |
env["EOL_PASSWORD"] = st.session_state.eol_password
|
| 845 |
|
| 846 |
+
_launch_proc(cmd, env, retry_log, sid, state, {
|
| 847 |
+
"status": "running",
|
| 848 |
+
"log_path": retry_log,
|
| 849 |
+
"run_log_paths": state.get("run_log_paths", []) + [retry_log],
|
| 850 |
+
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 851 |
else:
|
| 852 |
st.warning("No TSs available yet — retry download or upload DOCX files above.")
|
| 853 |
|
scripts/cr_parser.py
CHANGED
|
@@ -301,7 +301,7 @@ def _parse_body(body, changes):
|
|
| 301 |
sec_anchor = ''
|
| 302 |
|
| 303 |
def flush_section():
|
| 304 |
-
nonlocal sec_state, sec_anchor
|
| 305 |
if not sec_del and not sec_ins:
|
| 306 |
sec_del.clear(); sec_sep.clear(); sec_ins.clear()
|
| 307 |
sec_state = 'stable'
|
|
@@ -315,6 +315,17 @@ def _parse_body(body, changes):
|
|
| 315 |
if t:
|
| 316 |
del_heading = t
|
| 317 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
# Serialize all elements for the manifest (del + sep + ins)
|
| 319 |
all_elems = sec_del + sec_sep + sec_ins
|
| 320 |
elements_xml = [etree.tostring(e, encoding='unicode') for e in all_elems]
|
|
@@ -332,6 +343,14 @@ def _parse_body(body, changes):
|
|
| 332 |
},
|
| 333 |
'elements_xml': elements_xml,
|
| 334 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
sec_del.clear(); sec_sep.clear(); sec_ins.clear()
|
| 336 |
sec_state = 'stable'
|
| 337 |
|
|
|
|
| 301 |
sec_anchor = ''
|
| 302 |
|
| 303 |
def flush_section():
|
| 304 |
+
nonlocal sec_state, sec_anchor, prev_stable_text
|
| 305 |
if not sec_del and not sec_ins:
|
| 306 |
sec_del.clear(); sec_sep.clear(); sec_ins.clear()
|
| 307 |
sec_state = 'stable'
|
|
|
|
| 315 |
if t:
|
| 316 |
del_heading = t
|
| 317 |
break
|
| 318 |
+
# Fallback: if first deleted element was a table, use its first cell text
|
| 319 |
+
if not del_heading:
|
| 320 |
+
for e in sec_del:
|
| 321 |
+
tag = e.tag.split('}')[-1] if '}' in e.tag else e.tag
|
| 322 |
+
if tag == 'tbl':
|
| 323 |
+
first_tc = e.find('.//' + qn('w:tc'))
|
| 324 |
+
if first_tc is not None:
|
| 325 |
+
p = first_tc.find('.//' + qn('w:p'))
|
| 326 |
+
del_heading = (_para_new_text(p) if p is not None
|
| 327 |
+
else _para_new_text(first_tc)).strip()
|
| 328 |
+
break
|
| 329 |
# Serialize all elements for the manifest (del + sep + ins)
|
| 330 |
all_elems = sec_del + sec_sep + sec_ins
|
| 331 |
elements_xml = [etree.tostring(e, encoding='unicode') for e in all_elems]
|
|
|
|
| 343 |
},
|
| 344 |
'elements_xml': elements_xml,
|
| 345 |
})
|
| 346 |
+
# Refresh anchor so subsequent para_insert targets the new text, not the deleted one
|
| 347 |
+
if sec_ins:
|
| 348 |
+
last_p = next((e for e in reversed(sec_ins)
|
| 349 |
+
if (e.tag.split('}')[-1] if '}' in e.tag else e.tag) == 'p'), None)
|
| 350 |
+
if last_p is not None:
|
| 351 |
+
candidate = _para_new_text(last_p).strip()
|
| 352 |
+
if candidate:
|
| 353 |
+
prev_stable_text = candidate
|
| 354 |
sec_del.clear(); sec_sep.clear(); sec_ins.clear()
|
| 355 |
sec_state = 'stable'
|
| 356 |
|
scripts/docx_helpers.py
CHANGED
|
@@ -75,7 +75,7 @@ def map_sections(doc, clause_numbers):
|
|
| 75 |
|
| 76 |
for i, para in enumerate(doc.paragraphs):
|
| 77 |
text = para.text.strip()
|
| 78 |
-
style = para.style.name
|
| 79 |
|
| 80 |
matched = False
|
| 81 |
for clause in clause_numbers:
|
|
@@ -418,9 +418,12 @@ def tracked_insert_table_row(tbl, cell_texts, rev, author=AUTHOR, date=DATE):
|
|
| 418 |
|
| 419 |
# Find the last row that contains at least one non-empty <w:t> node.
|
| 420 |
# This skips pre-allocated blank rows at the table bottom.
|
|
|
|
| 421 |
last_content_tr = all_trs[-1]
|
| 422 |
for tr in reversed(all_trs):
|
| 423 |
-
|
|
|
|
|
|
|
| 424 |
last_content_tr = tr
|
| 425 |
break
|
| 426 |
|
|
|
|
| 75 |
|
| 76 |
for i, para in enumerate(doc.paragraphs):
|
| 77 |
text = para.text.strip()
|
| 78 |
+
style = para.style.name if para.style else 'Normal'
|
| 79 |
|
| 80 |
matched = False
|
| 81 |
for clause in clause_numbers:
|
|
|
|
| 418 |
|
| 419 |
# Find the last row that contains at least one non-empty <w:t> node.
|
| 420 |
# This skips pre-allocated blank rows at the table bottom.
|
| 421 |
+
_CONTENT_TAGS = {qn('w:t'), qn('w:hyperlink'), qn('w:drawing'), qn('w:object')}
|
| 422 |
last_content_tr = all_trs[-1]
|
| 423 |
for tr in reversed(all_trs):
|
| 424 |
+
has_text = any(t.text and t.text.strip() for t in tr.findall('.//' + qn('w:t')))
|
| 425 |
+
has_other = any(el.tag in _CONTENT_TAGS for el in tr.iter())
|
| 426 |
+
if has_text or has_other:
|
| 427 |
last_content_tr = tr
|
| 428 |
break
|
| 429 |
|
scripts/etsi_client.py
CHANGED
|
@@ -267,8 +267,6 @@ class ETSISpecFinder:
|
|
| 267 |
today = datetime.date.today().isoformat()
|
| 268 |
|
| 269 |
base_params = {
|
| 270 |
-
"option": "com_standardssearch",
|
| 271 |
-
"view": "data",
|
| 272 |
"format": "json",
|
| 273 |
"page": "1",
|
| 274 |
"title": "1",
|
|
@@ -304,9 +302,12 @@ class ETSISpecFinder:
|
|
| 304 |
params = {**base_params, "search": query}
|
| 305 |
try:
|
| 306 |
resp = requests.get(
|
| 307 |
-
"https://www.etsi.org/",
|
| 308 |
params=params,
|
| 309 |
-
headers=
|
|
|
|
|
|
|
|
|
|
| 310 |
verify=False,
|
| 311 |
timeout=15,
|
| 312 |
proxies=_get_proxies(),
|
|
|
|
| 267 |
today = datetime.date.today().isoformat()
|
| 268 |
|
| 269 |
base_params = {
|
|
|
|
|
|
|
| 270 |
"format": "json",
|
| 271 |
"page": "1",
|
| 272 |
"title": "1",
|
|
|
|
| 302 |
params = {**base_params, "search": query}
|
| 303 |
try:
|
| 304 |
resp = requests.get(
|
| 305 |
+
"https://www.etsi.org/custom/standardssearch/data.php",
|
| 306 |
params=params,
|
| 307 |
+
headers={
|
| 308 |
+
**self.headers,
|
| 309 |
+
"Referer": "https://www.etsi.org/standards/",
|
| 310 |
+
},
|
| 311 |
verify=False,
|
| 312 |
timeout=15,
|
| 313 |
proxies=_get_proxies(),
|
scripts/fetch_crs.py
CHANGED
|
@@ -67,9 +67,9 @@ def parse_excel_all_accepted(excel_path: str):
|
|
| 67 |
ext = path.suffix.lower()
|
| 68 |
|
| 69 |
if ext == ".xls":
|
| 70 |
-
return
|
| 71 |
elif ext == ".xlsx":
|
| 72 |
-
return
|
| 73 |
else:
|
| 74 |
raise ValueError(f"Unsupported file extension: {ext!r}. Expected .xls or .xlsx")
|
| 75 |
|
|
@@ -78,7 +78,12 @@ def _name_pattern(name: str) -> re.Pattern:
|
|
| 78 |
return re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE)
|
| 79 |
|
| 80 |
|
| 81 |
-
def _parse_xls(path: Path, person_name: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
try:
|
| 83 |
import xlrd
|
| 84 |
except ImportError:
|
|
@@ -101,12 +106,15 @@ def _parse_xls(path: Path, person_name: str):
|
|
| 101 |
by_col = col.get("SubmittedBy") or col.get("Submitted By") or col.get("submittedby")
|
| 102 |
title_col = col.get("Title") or col.get("title")
|
| 103 |
|
| 104 |
-
for name, c
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
| 107 |
raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")
|
| 108 |
|
| 109 |
-
pattern = _name_pattern(person_name)
|
| 110 |
results = []
|
| 111 |
|
| 112 |
for r in range(2, ws.nrows): # skip header + empty duplicate
|
|
@@ -120,15 +128,20 @@ def _parse_xls(path: Path, person_name: str):
|
|
| 120 |
continue
|
| 121 |
if status != "Accepted":
|
| 122 |
continue
|
| 123 |
-
if not pattern.search(submitted_by):
|
| 124 |
continue
|
| 125 |
|
| 126 |
-
results.append((uid, title))
|
| 127 |
|
| 128 |
return results
|
| 129 |
|
| 130 |
|
| 131 |
-
def _parse_xlsx(path: Path, person_name: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
try:
|
| 133 |
import openpyxl
|
| 134 |
except ImportError:
|
|
@@ -153,12 +166,15 @@ def _parse_xlsx(path: Path, person_name: str):
|
|
| 153 |
by_col = col.get("SubmittedBy") or col.get("Submitted By") or col.get("submittedby")
|
| 154 |
title_col = col.get("Title") or col.get("title")
|
| 155 |
|
| 156 |
-
for name, c
|
| 157 |
-
|
| 158 |
-
|
|
|
|
|
|
|
|
|
|
| 159 |
raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")
|
| 160 |
|
| 161 |
-
pattern = _name_pattern(person_name)
|
| 162 |
results = []
|
| 163 |
|
| 164 |
for row in rows:
|
|
@@ -178,107 +194,10 @@ def _parse_xlsx(path: Path, person_name: str):
|
|
| 178 |
continue
|
| 179 |
if status != "Accepted":
|
| 180 |
continue
|
| 181 |
-
if not pattern.search(submitted_by):
|
| 182 |
continue
|
| 183 |
|
| 184 |
-
results.append((uid, title))
|
| 185 |
-
|
| 186 |
-
return results
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
def _parse_xls_all(path: Path):
|
| 190 |
-
"""Return (uid, title, submitted_by) for all Accepted CRs (no person filter)."""
|
| 191 |
-
try:
|
| 192 |
-
import xlrd
|
| 193 |
-
except ImportError:
|
| 194 |
-
sys.exit("ERROR: xlrd is not installed. Run: pip install xlrd")
|
| 195 |
-
|
| 196 |
-
wb = xlrd.open_workbook(str(path))
|
| 197 |
-
try:
|
| 198 |
-
ws = wb.sheet_by_name("Contributions")
|
| 199 |
-
except xlrd.XLRDError:
|
| 200 |
-
ws = wb.sheet_by_index(0)
|
| 201 |
-
|
| 202 |
-
headers = [str(ws.cell_value(0, c)).strip() for c in range(ws.ncols)]
|
| 203 |
-
col = {h: i for i, h in enumerate(headers)}
|
| 204 |
-
|
| 205 |
-
uid_col = col.get("Uid") or col.get("UID") or col.get("uid")
|
| 206 |
-
type_col = col.get("Type") or col.get("type")
|
| 207 |
-
status_col = col.get("Status") or col.get("status")
|
| 208 |
-
by_col = col.get("SubmittedBy") or col.get("Submitted By") or col.get("submittedby")
|
| 209 |
-
title_col = col.get("Title") or col.get("title")
|
| 210 |
-
|
| 211 |
-
for name, c in [("Uid", uid_col), ("Type", type_col),
|
| 212 |
-
("Status", status_col), ("SubmittedBy", by_col)]:
|
| 213 |
-
if c is None:
|
| 214 |
-
raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")
|
| 215 |
-
|
| 216 |
-
results = []
|
| 217 |
-
for r in range(2, ws.nrows):
|
| 218 |
-
uid = str(ws.cell_value(r, uid_col)).strip()
|
| 219 |
-
doc_type = str(ws.cell_value(r, type_col)).strip()
|
| 220 |
-
status = str(ws.cell_value(r, status_col)).strip()
|
| 221 |
-
submitted_by = str(ws.cell_value(r, by_col)).strip()
|
| 222 |
-
title = str(ws.cell_value(r, title_col)).strip() if title_col is not None else ""
|
| 223 |
-
|
| 224 |
-
if doc_type != "CR":
|
| 225 |
-
continue
|
| 226 |
-
if status != "Accepted":
|
| 227 |
-
continue
|
| 228 |
-
|
| 229 |
-
results.append((uid, title, submitted_by))
|
| 230 |
-
|
| 231 |
-
return results
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
def _parse_xlsx_all(path: Path):
|
| 235 |
-
"""Return (uid, title, submitted_by) for all Accepted CRs (no person filter)."""
|
| 236 |
-
try:
|
| 237 |
-
import openpyxl
|
| 238 |
-
except ImportError:
|
| 239 |
-
sys.exit("ERROR: openpyxl is not installed. Run: pip install openpyxl")
|
| 240 |
-
|
| 241 |
-
wb = openpyxl.load_workbook(str(path), read_only=True, data_only=True)
|
| 242 |
-
ws = wb["Contributions"] if "Contributions" in wb.sheetnames else wb.active
|
| 243 |
-
|
| 244 |
-
rows = iter(ws.iter_rows(values_only=True))
|
| 245 |
-
header_row = next(rows)
|
| 246 |
-
headers = [str(h).strip() if h is not None else "" for h in header_row]
|
| 247 |
-
col = {h: i for i, h in enumerate(headers)}
|
| 248 |
-
|
| 249 |
-
next(rows, None) # skip empty duplicate row
|
| 250 |
-
|
| 251 |
-
uid_col = col.get("Uid") or col.get("UID") or col.get("uid")
|
| 252 |
-
type_col = col.get("Type") or col.get("type")
|
| 253 |
-
status_col = col.get("Status") or col.get("status")
|
| 254 |
-
by_col = col.get("SubmittedBy") or col.get("Submitted By") or col.get("submittedby")
|
| 255 |
-
title_col = col.get("Title") or col.get("title")
|
| 256 |
-
|
| 257 |
-
for name, c in [("Uid", uid_col), ("Type", type_col),
|
| 258 |
-
("Status", status_col), ("SubmittedBy", by_col)]:
|
| 259 |
-
if c is None:
|
| 260 |
-
raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")
|
| 261 |
-
|
| 262 |
-
results = []
|
| 263 |
-
for row in rows:
|
| 264 |
-
def cell(c):
|
| 265 |
-
v = row[c] if c < len(row) else None
|
| 266 |
-
return str(v).strip() if v is not None else ""
|
| 267 |
-
|
| 268 |
-
uid = cell(uid_col)
|
| 269 |
-
doc_type = cell(type_col)
|
| 270 |
-
status = cell(status_col)
|
| 271 |
-
submitted_by = cell(by_col)
|
| 272 |
-
title = cell(title_col) if title_col is not None else ""
|
| 273 |
-
|
| 274 |
-
if not uid:
|
| 275 |
-
continue
|
| 276 |
-
if doc_type != "CR":
|
| 277 |
-
continue
|
| 278 |
-
if status != "Accepted":
|
| 279 |
-
continue
|
| 280 |
-
|
| 281 |
-
results.append((uid, title, submitted_by))
|
| 282 |
|
| 283 |
return results
|
| 284 |
|
|
@@ -336,8 +255,29 @@ def download_cr(uid: str, cr_dir: Path, eol_user: str, eol_password: str):
|
|
| 336 |
# Step 3 — Parse CR Cover Pages
|
| 337 |
# ---------------------------------------------------------------------------
|
| 338 |
|
| 339 |
-
SPEC_PATTERN
|
| 340 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
|
| 342 |
|
| 343 |
def parse_cr_cover(docx_path: Path):
|
|
@@ -360,7 +300,9 @@ def parse_cr_cover(docx_path: Path):
|
|
| 360 |
if not doc.tables:
|
| 361 |
return None, None
|
| 362 |
|
| 363 |
-
table = doc
|
|
|
|
|
|
|
| 364 |
|
| 365 |
# Collect all non-empty cell texts in order
|
| 366 |
cells = []
|
|
@@ -376,20 +318,36 @@ def parse_cr_cover(docx_path: Path):
|
|
| 376 |
version = None
|
| 377 |
|
| 378 |
for i, text in enumerate(cells):
|
| 379 |
-
#
|
| 380 |
-
if SPEC_PATTERN.match(text)
|
| 381 |
spec_number = text
|
| 382 |
|
| 383 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
if "Current version:" in text and i + 1 < len(cells):
|
| 385 |
candidate = cells[i + 1]
|
| 386 |
if VERSION_PATTERN.match(candidate):
|
| 387 |
-
version = candidate
|
| 388 |
|
| 389 |
-
# Also accept "Current version" without colon
|
| 390 |
if text in ("Current version:", "Current version") and version is None:
|
| 391 |
if i + 1 < len(cells) and VERSION_PATTERN.match(cells[i + 1]):
|
| 392 |
-
version = cells[i + 1]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
|
| 394 |
if spec_number:
|
| 395 |
spec_number = spec_number.replace('\xa0', ' ').strip()
|
|
|
|
| 67 |
ext = path.suffix.lower()
|
| 68 |
|
| 69 |
if ext == ".xls":
|
| 70 |
+
return _parse_xls(path)
|
| 71 |
elif ext == ".xlsx":
|
| 72 |
+
return _parse_xlsx(path)
|
| 73 |
else:
|
| 74 |
raise ValueError(f"Unsupported file extension: {ext!r}. Expected .xls or .xlsx")
|
| 75 |
|
|
|
|
| 78 |
return re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE)
|
| 79 |
|
| 80 |
|
| 81 |
+
def _parse_xls(path: Path, person_name: str | None = None):
|
| 82 |
+
"""
|
| 83 |
+
Return Accepted CRs from an .xls file.
|
| 84 |
+
If person_name is given: return (uid, title) filtered to that name.
|
| 85 |
+
If person_name is None: return (uid, title, submitted_by) for all.
|
| 86 |
+
"""
|
| 87 |
try:
|
| 88 |
import xlrd
|
| 89 |
except ImportError:
|
|
|
|
| 106 |
by_col = col.get("SubmittedBy") or col.get("Submitted By") or col.get("submittedby")
|
| 107 |
title_col = col.get("Title") or col.get("title")
|
| 108 |
|
| 109 |
+
for name, c, required in [
|
| 110 |
+
("Uid", uid_col, True), ("Type", type_col, True),
|
| 111 |
+
("Status", status_col, True), ("SubmittedBy", by_col, True),
|
| 112 |
+
("Title", title_col, False),
|
| 113 |
+
]:
|
| 114 |
+
if c is None and required:
|
| 115 |
raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")
|
| 116 |
|
| 117 |
+
pattern = _name_pattern(person_name) if person_name else None
|
| 118 |
results = []
|
| 119 |
|
| 120 |
for r in range(2, ws.nrows): # skip header + empty duplicate
|
|
|
|
| 128 |
continue
|
| 129 |
if status != "Accepted":
|
| 130 |
continue
|
| 131 |
+
if pattern and not pattern.search(submitted_by):
|
| 132 |
continue
|
| 133 |
|
| 134 |
+
results.append((uid, title) if person_name is not None else (uid, title, submitted_by))
|
| 135 |
|
| 136 |
return results
|
| 137 |
|
| 138 |
|
| 139 |
+
def _parse_xlsx(path: Path, person_name: str | None = None):
|
| 140 |
+
"""
|
| 141 |
+
Return Accepted CRs from an .xlsx file.
|
| 142 |
+
If person_name is given: return (uid, title) filtered to that name.
|
| 143 |
+
If person_name is None: return (uid, title, submitted_by) for all.
|
| 144 |
+
"""
|
| 145 |
try:
|
| 146 |
import openpyxl
|
| 147 |
except ImportError:
|
|
|
|
| 166 |
by_col = col.get("SubmittedBy") or col.get("Submitted By") or col.get("submittedby")
|
| 167 |
title_col = col.get("Title") or col.get("title")
|
| 168 |
|
| 169 |
+
for name, c, required in [
|
| 170 |
+
("Uid", uid_col, True), ("Type", type_col, True),
|
| 171 |
+
("Status", status_col, True), ("SubmittedBy", by_col, True),
|
| 172 |
+
("Title", title_col, False),
|
| 173 |
+
]:
|
| 174 |
+
if c is None and required:
|
| 175 |
raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")
|
| 176 |
|
| 177 |
+
pattern = _name_pattern(person_name) if person_name else None
|
| 178 |
results = []
|
| 179 |
|
| 180 |
for row in rows:
|
|
|
|
| 194 |
continue
|
| 195 |
if status != "Accepted":
|
| 196 |
continue
|
| 197 |
+
if pattern and not pattern.search(submitted_by):
|
| 198 |
continue
|
| 199 |
|
| 200 |
+
results.append((uid, title) if person_name is not None else (uid, title, submitted_by))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
|
| 202 |
return results
|
| 203 |
|
|
|
|
| 255 |
# Step 3 — Parse CR Cover Pages
|
| 256 |
# ---------------------------------------------------------------------------
|
| 257 |
|
| 258 |
+
SPEC_PATTERN = re.compile(r"^\d{3}\s\d{3}(-\d+)*$") # "102 221" or "102 230-2"
|
| 259 |
+
SPEC_SEARCH = re.compile(r"\b\d{3}\s\d{3}(?:-\d+)*\b") # substring search fallback
|
| 260 |
+
VERSION_PATTERN = re.compile(r"^[Vv]?\d+\.\d+(\.\d+)?$") # X.Y or X.Y.Z, optional V prefix
|
| 261 |
+
VERSION_SEARCH = re.compile(r"\b\d+\.\d+\.\d+\b") # substring fallback
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
def _normalise_version(v: str) -> str:
|
| 265 |
+
"""Strip optional V prefix and pad to X.Y.Z."""
|
| 266 |
+
v = v.lstrip('Vv')
|
| 267 |
+
parts = v.split('.')
|
| 268 |
+
while len(parts) < 3:
|
| 269 |
+
parts.append('0')
|
| 270 |
+
return '.'.join(parts[:3])
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def _find_cover_table(doc):
|
| 274 |
+
"""Return the CR cover table, scanning all tables for one containing CHANGE REQUEST."""
|
| 275 |
+
MARKERS = {"CHANGE REQUEST", "CR", "CHANGE REQUEST"}
|
| 276 |
+
for tbl in doc.tables:
|
| 277 |
+
cells_text = {c.text.strip() for row in tbl.rows for c in row.cells}
|
| 278 |
+
if cells_text & MARKERS:
|
| 279 |
+
return tbl
|
| 280 |
+
return None
|
| 281 |
|
| 282 |
|
| 283 |
def parse_cr_cover(docx_path: Path):
|
|
|
|
| 300 |
if not doc.tables:
|
| 301 |
return None, None
|
| 302 |
|
| 303 |
+
table = _find_cover_table(doc)
|
| 304 |
+
if table is None:
|
| 305 |
+
return None, None
|
| 306 |
|
| 307 |
# Collect all non-empty cell texts in order
|
| 308 |
cells = []
|
|
|
|
| 318 |
version = None
|
| 319 |
|
| 320 |
for i, text in enumerate(cells):
|
| 321 |
+
# ── Strategy 1: exact cell match "NNN NNN" or "NNN NNN-N" ────────────
|
| 322 |
+
if spec_number is None and SPEC_PATTERN.match(text):
|
| 323 |
spec_number = text
|
| 324 |
|
| 325 |
+
# ── Strategy 2: positional — cell immediately after "CHANGE REQUEST" ─
|
| 326 |
+
# The cover table always places the spec number in the cell right after
|
| 327 |
+
# the "CHANGE REQUEST" label.
|
| 328 |
+
if spec_number is None and text.strip() == "CHANGE REQUEST" and i + 1 < len(cells):
|
| 329 |
+
candidate = cells[i + 1].strip()
|
| 330 |
+
if SPEC_PATTERN.match(candidate):
|
| 331 |
+
spec_number = candidate
|
| 332 |
+
|
| 333 |
+
# ── Version: cell immediately after "Current version:" ───────────────
|
| 334 |
if "Current version:" in text and i + 1 < len(cells):
|
| 335 |
candidate = cells[i + 1]
|
| 336 |
if VERSION_PATTERN.match(candidate):
|
| 337 |
+
version = _normalise_version(candidate)
|
| 338 |
|
|
|
|
| 339 |
if text in ("Current version:", "Current version") and version is None:
|
| 340 |
if i + 1 < len(cells) and VERSION_PATTERN.match(cells[i + 1]):
|
| 341 |
+
version = _normalise_version(cells[i + 1])
|
| 342 |
+
|
| 343 |
+
# ── Strategy 3: substring search across all cells ─────────────────────────
|
| 344 |
+
# Catches cases where the spec number is embedded in a longer cell string.
|
| 345 |
+
if spec_number is None:
|
| 346 |
+
for text in cells:
|
| 347 |
+
m = SPEC_SEARCH.search(text)
|
| 348 |
+
if m:
|
| 349 |
+
spec_number = m.group(0)
|
| 350 |
+
break
|
| 351 |
|
| 352 |
if spec_number:
|
| 353 |
spec_number = spec_number.replace('\xa0', ' ').strip()
|
scripts/finalize_ts.py
CHANGED
|
@@ -27,6 +27,7 @@ from docx_helpers import (
|
|
| 27 |
AUTHOR,
|
| 28 |
DATE,
|
| 29 |
)
|
|
|
|
| 30 |
|
| 31 |
|
| 32 |
# ── Path helpers ──────────────────────────────────────────────────────────────
|
|
@@ -58,6 +59,8 @@ def compute_pub_date():
|
|
| 58 |
def derive_new_version(v: str) -> str:
|
| 59 |
"""Increment middle component of X.Y.Z → X.(Y+1).0."""
|
| 60 |
parts = v.split('.')
|
|
|
|
|
|
|
| 61 |
parts[1] = str(int(parts[1]) + 1)
|
| 62 |
parts[2] = '0'
|
| 63 |
return '.'.join(parts)
|
|
@@ -75,7 +78,9 @@ def extract_cr_metadata(cr_docx_path: str) -> dict:
|
|
| 75 |
if not doc.tables:
|
| 76 |
raise ValueError('CR has no tables — cannot extract metadata')
|
| 77 |
|
| 78 |
-
tbl = doc
|
|
|
|
|
|
|
| 79 |
|
| 80 |
# Collect all cell texts for scanning
|
| 81 |
cells = []
|
|
@@ -165,10 +170,16 @@ def _detect_meeting_separator(tbl):
|
|
| 165 |
number, e.g. '#' in 'SET#115' or '-' in 'SET-119'.
|
| 166 |
Returns the detected separator character, defaulting to '#'.
|
| 167 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
for row in reversed(tbl.rows):
|
| 169 |
cells = row.cells
|
| 170 |
-
if len(cells) >
|
| 171 |
-
text = cells[
|
| 172 |
if text:
|
| 173 |
m = re.search(r'[A-Za-z]([^A-Za-z0-9])\d', text)
|
| 174 |
if m:
|
|
@@ -198,7 +209,8 @@ def find_change_history_table(ts_doc):
|
|
| 198 |
continue
|
| 199 |
if tbl.rows:
|
| 200 |
header_text = ' '.join(c.text.strip() for c in tbl.rows[0].cells).lower()
|
| 201 |
-
|
|
|
|
| 202 |
return tbl
|
| 203 |
raise NoChangeHistoryTable(
|
| 204 |
'No Change History table found in this document '
|
|
|
|
| 27 |
AUTHOR,
|
| 28 |
DATE,
|
| 29 |
)
|
| 30 |
+
from fetch_crs import _find_cover_table
|
| 31 |
|
| 32 |
|
| 33 |
# ── Path helpers ──────────────────────────────────────────────────────────────
|
|
|
|
| 59 |
def derive_new_version(v: str) -> str:
    """Return the next version: bump the middle component, X.Y.Z → X.(Y+1).0.

    Args:
        v: Dotted version string such as ``'18.3.0'``. Surrounding whitespace
            (including non-breaking spaces) is ignored. Missing or empty
            components after the first are treated as ``0``, so ``'18'``,
            ``'18.'`` and ``'18.0'`` all yield ``'18.1.0'``.

    Returns:
        The version with its second component incremented and its third
        component reset to ``'0'``. The first component, and any components
        beyond the third, are passed through unchanged.

    Raises:
        ValueError: If the second component is present but is not an integer.
    """
    parts = v.strip().split('.')
    # Pad short inputs ("18", "18.3") so that indices 1 and 2 always exist.
    if len(parts) < 3:
        parts += ['0'] * (3 - len(parts))
    # A trailing dot ("18.") leaves an empty middle component; count it as 0
    # instead of letting int('') raise.
    minor = parts[1].strip() or '0'
    parts[1] = str(int(minor) + 1)
    parts[2] = '0'
    return '.'.join(parts)
|
|
|
|
| 78 |
if not doc.tables:
|
| 79 |
raise ValueError('CR has no tables β cannot extract metadata')
|
| 80 |
|
| 81 |
+
tbl = _find_cover_table(doc)
|
| 82 |
+
if tbl is None:
|
| 83 |
+
raise ValueError('CR cover table not found β no table containing "CHANGE REQUEST"')
|
| 84 |
|
| 85 |
# Collect all cell texts for scanning
|
| 86 |
cells = []
|
|
|
|
| 170 |
number, e.g. '#' in 'SET#115' or '-' in 'SET-119'.
|
| 171 |
Returns the detected separator character, defaulting to '#'.
|
| 172 |
"""
|
| 173 |
+
meet_col = 1 # default: standard ETSI Change History has Meeting in col 1
|
| 174 |
+
if tbl.rows:
|
| 175 |
+
for c_idx, cell in enumerate(tbl.rows[0].cells):
|
| 176 |
+
if any(kw in cell.text.lower() for kw in ('meeting', 'body', 'tsg')):
|
| 177 |
+
meet_col = c_idx
|
| 178 |
+
break
|
| 179 |
for row in reversed(tbl.rows):
|
| 180 |
cells = row.cells
|
| 181 |
+
if len(cells) > meet_col:
|
| 182 |
+
text = cells[meet_col].text.strip()
|
| 183 |
if text:
|
| 184 |
m = re.search(r'[A-Za-z]([^A-Za-z0-9])\d', text)
|
| 185 |
if m:
|
|
|
|
| 209 |
continue
|
| 210 |
if tbl.rows:
|
| 211 |
header_text = ' '.join(c.text.strip() for c in tbl.rows[0].cells).lower()
|
| 212 |
+
header_words = set(re.findall(r'\b\w+\b', header_text))
|
| 213 |
+
if {'cr', 'date'}.issubset(header_words):
|
| 214 |
return tbl
|
| 215 |
raise NoChangeHistoryTable(
|
| 216 |
'No Change History table found in this document '
|
scripts/hf_cr_index.py
CHANGED
|
@@ -44,15 +44,18 @@ def push_hf_index(records: list[dict], hf_token: str, hf_repo: str) -> None:
|
|
| 44 |
exist_ok=True,
|
| 45 |
token=hf_token,
|
| 46 |
)
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
exist_ok=True,
|
| 45 |
token=hf_token,
|
| 46 |
)
|
| 47 |
+
import tempfile
|
| 48 |
+
with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl',
|
| 49 |
+
encoding='utf-8', delete=False) as _f:
|
| 50 |
+
_f.write("\n".join(json.dumps(r, ensure_ascii=False) for r in records))
|
| 51 |
+
_tmp_path = _f.name
|
| 52 |
+
try:
|
| 53 |
+
api.upload_file(
|
| 54 |
+
path_or_fileobj=_tmp_path,
|
| 55 |
+
path_in_repo="cr_index.jsonl",
|
| 56 |
+
repo_id=hf_repo,
|
| 57 |
+
repo_type="dataset",
|
| 58 |
+
token=hf_token,
|
| 59 |
+
)
|
| 60 |
+
finally:
|
| 61 |
+
Path(_tmp_path).unlink(missing_ok=True)
|
scripts/orchestrate_cr.py
CHANGED
|
@@ -81,6 +81,173 @@ class _TeeWriter:
|
|
| 81 |
self._real.flush()
|
| 82 |
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
# ββ Shared Steps 2, 4, 5, 6 ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 85 |
|
| 86 |
def _run_steps_2_to_6(cr_list, ts_groups, output_dir, cr_dir, ts_dir,
|
|
@@ -182,144 +349,12 @@ def _run_steps_2_to_6(cr_list, ts_groups, output_dir, cr_dir, ts_dir,
|
|
| 182 |
report = [] # (ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors)
|
| 183 |
|
| 184 |
for (spec_number, version), uids in ts_groups.items():
|
| 185 |
-
ts_key = f'TS {spec_number} v{version}'
|
| 186 |
spec_compact = spec_number.replace(' ', '')
|
| 187 |
spec_dir = spec_dirs.get((spec_number, version), ts_dir / spec_compact)
|
| 188 |
spec_dir.mkdir(parents=True, exist_ok=True)
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
ts_applied = spec_dir / f'ts_{spec_compact}_v{version}_applied.docx'
|
| 193 |
-
ts_final = spec_dir / f'{stem}.docx'
|
| 194 |
-
log_path = spec_dir / f'{stem}.log'
|
| 195 |
-
errors = []
|
| 196 |
-
|
| 197 |
-
print(f'\n-- {ts_key} ({len(uids)} CR(s): {", ".join(uids)}) --')
|
| 198 |
-
|
| 199 |
-
if (spec_number, version) not in ts_paths:
|
| 200 |
-
msg = 'TS download failed β skipping'
|
| 201 |
-
print(f' SKIP: {msg}')
|
| 202 |
-
report.append((ts_key, 0, 0, len(uids), None, log_path, [msg]))
|
| 203 |
-
continue
|
| 204 |
-
|
| 205 |
-
ts_in = ts_paths[(spec_number, version)]
|
| 206 |
-
|
| 207 |
-
log_buf = io.StringIO()
|
| 208 |
-
tee = _TeeWriter(sys.stdout, log_buf)
|
| 209 |
-
|
| 210 |
-
with contextlib.redirect_stdout(tee):
|
| 211 |
-
log_header = (
|
| 212 |
-
f'Pipeline Log\n'
|
| 213 |
-
f'TS: {spec_number} v{version} -> v{new_v}\n'
|
| 214 |
-
f'CRs: {", ".join(uids)}\n'
|
| 215 |
-
f'Date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n'
|
| 216 |
-
f'{"=" * 60}\n'
|
| 217 |
-
)
|
| 218 |
-
print(log_header, end='')
|
| 219 |
-
|
| 220 |
-
combined_manifest = []
|
| 221 |
-
participating_uids = []
|
| 222 |
-
|
| 223 |
-
for uid in uids:
|
| 224 |
-
if uid not in cr_paths:
|
| 225 |
-
errors.append(f'[{uid}] CR download had failed β skipped')
|
| 226 |
-
continue
|
| 227 |
-
print(f' Parsing {uid}... ', end='', flush=True)
|
| 228 |
-
try:
|
| 229 |
-
changes = parse_cr(cr_paths[uid])
|
| 230 |
-
combined_manifest.extend(changes)
|
| 231 |
-
participating_uids.append(uid)
|
| 232 |
-
print(f'{len(changes)} change(s)')
|
| 233 |
-
except Exception as e:
|
| 234 |
-
errors.append(f'[{uid}] parse ERROR: {e}')
|
| 235 |
-
print(f'ERROR: {e}')
|
| 236 |
-
|
| 237 |
-
if not combined_manifest:
|
| 238 |
-
print(' No changes parsed β skipping apply step.')
|
| 239 |
-
report.append((ts_key, 0, 0, len(uids), None, log_path,
|
| 240 |
-
errors + ['No changes parsed']))
|
| 241 |
-
log_path.write_text(log_buf.getvalue(), encoding='utf-8')
|
| 242 |
-
continue
|
| 243 |
-
|
| 244 |
-
print(f' Applying {len(combined_manifest)} change(s) to {ts_in.name}...')
|
| 245 |
-
try:
|
| 246 |
-
n_ok, n_skip, log_lines = apply_manifest(
|
| 247 |
-
ts_in, combined_manifest, ts_applied, author=author, date=tc_date
|
| 248 |
-
)
|
| 249 |
-
except Exception as e:
|
| 250 |
-
errors.append(f'apply_manifest ERROR: {e}')
|
| 251 |
-
print(f' ERROR: {e}')
|
| 252 |
-
report.append((ts_key, 0, 0, len(uids), None, log_path, errors))
|
| 253 |
-
log_path.write_text(log_buf.getvalue(), encoding='utf-8')
|
| 254 |
-
continue
|
| 255 |
-
|
| 256 |
-
for line in log_lines:
|
| 257 |
-
print(f' {line}')
|
| 258 |
-
for line in log_lines:
|
| 259 |
-
if line.strip().startswith('ERROR'):
|
| 260 |
-
errors.append(line.strip())
|
| 261 |
-
print(f' -> Applied: {n_ok} Skipped: {n_skip}')
|
| 262 |
-
|
| 263 |
-
print(' Finalising metadata...')
|
| 264 |
-
try:
|
| 265 |
-
ts_doc = docx_lib.Document(str(ts_applied))
|
| 266 |
-
rev = RevCounter(ts_doc)
|
| 267 |
-
|
| 268 |
-
pub_ym, pub_month_year = compute_pub_date()
|
| 269 |
-
old_v = version
|
| 270 |
-
|
| 271 |
-
title_text = ts_doc.paragraphs[0].text
|
| 272 |
-
date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
|
| 273 |
-
old_date_str = date_match.group(1) if date_match else ''
|
| 274 |
-
|
| 275 |
-
print(f' Version: {old_v} -> {new_v}')
|
| 276 |
-
print(f' Publication: {pub_month_year} ({pub_ym})')
|
| 277 |
-
|
| 278 |
-
for uid in participating_uids:
|
| 279 |
-
try:
|
| 280 |
-
meta = extract_cr_metadata(str(cr_paths[uid]))
|
| 281 |
-
ch_cells = update_change_history_table(
|
| 282 |
-
ts_doc, meta, pub_ym, old_v, new_v, rev, author, tc_date
|
| 283 |
-
)
|
| 284 |
-
print(f' [Change History] {uid}: {ch_cells}')
|
| 285 |
-
except NoChangeHistoryTable:
|
| 286 |
-
print(f' [Change History] {uid}: NOT PRESENT β this document has no Change History table (History table only)')
|
| 287 |
-
except Exception as e:
|
| 288 |
-
errors.append(f'[{uid}] Change History ERROR: {e}')
|
| 289 |
-
print(f' [Change History] {uid}: ERROR β {e}')
|
| 290 |
-
|
| 291 |
-
try:
|
| 292 |
-
h_cells = update_history_table(
|
| 293 |
-
ts_doc, new_v, pub_month_year, rev, author, tc_date
|
| 294 |
-
)
|
| 295 |
-
print(f' [History] {h_cells}')
|
| 296 |
-
except Exception as e:
|
| 297 |
-
errors.append(f'History table ERROR: {e}')
|
| 298 |
-
print(f' [History] ERROR β {e}')
|
| 299 |
-
|
| 300 |
-
if old_date_str:
|
| 301 |
-
try:
|
| 302 |
-
update_title_para(
|
| 303 |
-
ts_doc, old_v, new_v, old_date_str, pub_ym, rev, author, tc_date
|
| 304 |
-
)
|
| 305 |
-
print(f' [Title] V{old_v} -> V{new_v}, ({old_date_str}) -> ({pub_ym})')
|
| 306 |
-
except Exception as e:
|
| 307 |
-
errors.append(f'Title update ERROR: {e}')
|
| 308 |
-
print(f' [Title] ERROR β {e}')
|
| 309 |
-
else:
|
| 310 |
-
print(f' [Title] SKIP β no (YYYY-MM) pattern in: {title_text!r}')
|
| 311 |
-
|
| 312 |
-
ts_doc.save(str(ts_final))
|
| 313 |
-
print(f' Saved: {spec_compact}/{ts_final.name}')
|
| 314 |
-
print(f' Log: {spec_compact}/{log_path.name}')
|
| 315 |
-
report.append((ts_key, n_ok, n_skip, len(uids), ts_final, log_path, errors))
|
| 316 |
-
|
| 317 |
-
except Exception as e:
|
| 318 |
-
errors.append(f'Finalisation ERROR: {e}')
|
| 319 |
-
print(f' Finalisation ERROR: {e}')
|
| 320 |
-
report.append((ts_key, n_ok, n_skip, len(uids), ts_applied, log_path, errors))
|
| 321 |
-
|
| 322 |
-
log_path.write_text(log_buf.getvalue(), encoding='utf-8')
|
| 323 |
|
| 324 |
return report, cr_paths, ts_paths, spec_dirs
|
| 325 |
|
|
@@ -450,144 +485,13 @@ def main():
|
|
| 450 |
report = []
|
| 451 |
|
| 452 |
for (spec_number, version), uids in ts_groups.items():
|
| 453 |
-
ts_key = f'TS {spec_number} v{version}'
|
| 454 |
spec_compact = spec_number.replace(' ', '')
|
| 455 |
spec_dir = spec_dirs.get((spec_number, version), ts_dir / spec_compact)
|
| 456 |
spec_dir.mkdir(parents=True, exist_ok=True)
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
ts_final = spec_dir / f'{stem}.docx'
|
| 462 |
-
log_path = spec_dir / f'{stem}.log'
|
| 463 |
-
errors = []
|
| 464 |
-
|
| 465 |
-
print(f'\n-- {ts_key} ({len(uids)} CR(s): {", ".join(uids)}) --')
|
| 466 |
-
|
| 467 |
-
if (spec_number, version) not in ts_paths:
|
| 468 |
-
msg = 'TS DOCX not on disk β skipping'
|
| 469 |
-
print(f' SKIP: {msg}')
|
| 470 |
-
report.append((ts_key, 0, 0, len(uids), None, log_path, [msg]))
|
| 471 |
-
continue
|
| 472 |
-
|
| 473 |
-
ts_in = ts_paths[(spec_number, version)]
|
| 474 |
-
|
| 475 |
-
log_buf = io.StringIO()
|
| 476 |
-
tee = _TeeWriter(sys.stdout, log_buf)
|
| 477 |
-
|
| 478 |
-
with contextlib.redirect_stdout(tee):
|
| 479 |
-
log_header = (
|
| 480 |
-
f'Pipeline Log (retry)\n'
|
| 481 |
-
f'TS: {spec_number} v{version} -> v{new_v}\n'
|
| 482 |
-
f'CRs: {", ".join(uids)}\n'
|
| 483 |
-
f'Date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n'
|
| 484 |
-
f'{"=" * 60}\n'
|
| 485 |
-
)
|
| 486 |
-
print(log_header, end='')
|
| 487 |
-
|
| 488 |
-
combined_manifest = []
|
| 489 |
-
participating_uids = []
|
| 490 |
-
|
| 491 |
-
for uid in uids:
|
| 492 |
-
if uid not in cr_paths:
|
| 493 |
-
errors.append(f'[{uid}] CR DOCX not found β skipped')
|
| 494 |
-
continue
|
| 495 |
-
print(f' Parsing {uid}... ', end='', flush=True)
|
| 496 |
-
try:
|
| 497 |
-
changes = parse_cr(cr_paths[uid])
|
| 498 |
-
combined_manifest.extend(changes)
|
| 499 |
-
participating_uids.append(uid)
|
| 500 |
-
print(f'{len(changes)} change(s)')
|
| 501 |
-
except Exception as e:
|
| 502 |
-
errors.append(f'[{uid}] parse ERROR: {e}')
|
| 503 |
-
print(f'ERROR: {e}')
|
| 504 |
-
|
| 505 |
-
if not combined_manifest:
|
| 506 |
-
print(' No changes parsed β skipping apply step.')
|
| 507 |
-
report.append((ts_key, 0, 0, len(uids), None, log_path,
|
| 508 |
-
errors + ['No changes parsed']))
|
| 509 |
-
log_path.write_text(log_buf.getvalue(), encoding='utf-8')
|
| 510 |
-
continue
|
| 511 |
-
|
| 512 |
-
print(f' Applying {len(combined_manifest)} change(s) to {ts_in.name}...')
|
| 513 |
-
try:
|
| 514 |
-
n_ok, n_skip, log_lines = apply_manifest(
|
| 515 |
-
ts_in, combined_manifest, ts_applied, author=author, date=tc_date
|
| 516 |
-
)
|
| 517 |
-
except Exception as e:
|
| 518 |
-
errors.append(f'apply_manifest ERROR: {e}')
|
| 519 |
-
print(f' ERROR: {e}')
|
| 520 |
-
report.append((ts_key, 0, 0, len(uids), None, log_path, errors))
|
| 521 |
-
log_path.write_text(log_buf.getvalue(), encoding='utf-8')
|
| 522 |
-
continue
|
| 523 |
-
|
| 524 |
-
for line in log_lines:
|
| 525 |
-
print(f' {line}')
|
| 526 |
-
for line in log_lines:
|
| 527 |
-
if line.strip().startswith('ERROR'):
|
| 528 |
-
errors.append(line.strip())
|
| 529 |
-
print(f' -> Applied: {n_ok} Skipped: {n_skip}')
|
| 530 |
-
|
| 531 |
-
print(' Finalising metadata...')
|
| 532 |
-
try:
|
| 533 |
-
ts_doc = docx_lib.Document(str(ts_applied))
|
| 534 |
-
rev = RevCounter(ts_doc)
|
| 535 |
-
|
| 536 |
-
pub_ym, pub_month_year = compute_pub_date()
|
| 537 |
-
old_v = version
|
| 538 |
-
|
| 539 |
-
title_text = ts_doc.paragraphs[0].text
|
| 540 |
-
date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
|
| 541 |
-
old_date_str = date_match.group(1) if date_match else ''
|
| 542 |
-
|
| 543 |
-
print(f' Version: {old_v} -> {new_v}')
|
| 544 |
-
print(f' Publication: {pub_month_year} ({pub_ym})')
|
| 545 |
-
|
| 546 |
-
for uid in participating_uids:
|
| 547 |
-
try:
|
| 548 |
-
meta = extract_cr_metadata(str(cr_paths[uid]))
|
| 549 |
-
ch_cells = update_change_history_table(
|
| 550 |
-
ts_doc, meta, pub_ym, old_v, new_v, rev, author, tc_date
|
| 551 |
-
)
|
| 552 |
-
print(f' [Change History] {uid}: {ch_cells}')
|
| 553 |
-
except NoChangeHistoryTable:
|
| 554 |
-
print(f' [Change History] {uid}: NOT PRESENT β this document has no Change History table (History table only)')
|
| 555 |
-
except Exception as e:
|
| 556 |
-
errors.append(f'[{uid}] Change History ERROR: {e}')
|
| 557 |
-
print(f' [Change History] {uid}: ERROR β {e}')
|
| 558 |
-
|
| 559 |
-
try:
|
| 560 |
-
h_cells = update_history_table(
|
| 561 |
-
ts_doc, new_v, pub_month_year, rev, author, tc_date
|
| 562 |
-
)
|
| 563 |
-
print(f' [History] {h_cells}')
|
| 564 |
-
except Exception as e:
|
| 565 |
-
errors.append(f'History table ERROR: {e}')
|
| 566 |
-
print(f' [History] ERROR β {e}')
|
| 567 |
-
|
| 568 |
-
if old_date_str:
|
| 569 |
-
try:
|
| 570 |
-
update_title_para(
|
| 571 |
-
ts_doc, old_v, new_v, old_date_str, pub_ym, rev, author, tc_date
|
| 572 |
-
)
|
| 573 |
-
print(f' [Title] V{old_v} -> V{new_v}, ({old_date_str}) -> ({pub_ym})')
|
| 574 |
-
except Exception as e:
|
| 575 |
-
errors.append(f'Title update ERROR: {e}')
|
| 576 |
-
print(f' [Title] ERROR β {e}')
|
| 577 |
-
else:
|
| 578 |
-
print(f' [Title] SKIP β no (YYYY-MM) pattern in: {title_text!r}')
|
| 579 |
-
|
| 580 |
-
ts_doc.save(str(ts_final))
|
| 581 |
-
print(f' Saved: {spec_compact}/{ts_final.name}')
|
| 582 |
-
print(f' Log: {spec_compact}/{log_path.name}')
|
| 583 |
-
report.append((ts_key, n_ok, n_skip, len(uids), ts_final, log_path, errors))
|
| 584 |
-
|
| 585 |
-
except Exception as e:
|
| 586 |
-
errors.append(f'Finalisation ERROR: {e}')
|
| 587 |
-
print(f' Finalisation ERROR: {e}')
|
| 588 |
-
report.append((ts_key, n_ok, n_skip, len(uids), ts_applied, log_path, errors))
|
| 589 |
-
|
| 590 |
-
log_path.write_text(log_buf.getvalue(), encoding='utf-8')
|
| 591 |
|
| 592 |
# Update failed_ts.json — remove entries that are now resolved
|
| 593 |
still_failed = [
|
|
@@ -601,11 +505,7 @@ def main():
|
|
| 601 |
n_partial = sum(1 for r in report if r[4] is not None and r[6])
|
| 602 |
n_failed = sum(1 for r in report if r[4] is None)
|
| 603 |
print(f'TSs processed: {n_success} fully OK, {n_partial} with warnings, {n_failed} skipped/failed')
|
| 604 |
-
|
| 605 |
-
status_tag = 'OK' if out_path and not errors else ('WARN' if out_path else 'SKIP')
|
| 606 |
-
print(f' [{status_tag}] {ts_key}')
|
| 607 |
-
for err in errors:
|
| 608 |
-
print(f' ! {err}')
|
| 609 |
return
|
| 610 |
|
| 611 |
# ββ TS mode β load HF index, skip Steps 1 & 3 ββββββββββββββββββββββββββββ
|
|
@@ -656,12 +556,7 @@ def main():
|
|
| 656 |
# Copy the CRs actually applied into the run output dir so the ZIP
|
| 657 |
# contains exactly the CRs used for this TS (only needed when using
|
| 658 |
# a shared CR cache that lives outside output_dir).
|
| 659 |
-
|
| 660 |
-
if cr_dir.resolve() != _run_cr_dir.resolve():
|
| 661 |
-
_run_cr_dir.mkdir(parents=True, exist_ok=True)
|
| 662 |
-
for _p in cr_paths.values():
|
| 663 |
-
if _p.exists():
|
| 664 |
-
shutil.copy2(_p, _run_cr_dir / _p.name)
|
| 665 |
|
| 666 |
_section('Final Report (TS mode)')
|
| 667 |
n_success = sum(1 for r in report if r[4] is not None and not r[6])
|
|
@@ -673,16 +568,7 @@ def main():
|
|
| 673 |
print(f'TSs updated: {n_success} fully OK, {n_partial} with warnings, {n_failed} failed')
|
| 674 |
print()
|
| 675 |
|
| 676 |
-
|
| 677 |
-
status = 'OK' if out_path and not errors else ('WARN' if out_path else 'FAIL')
|
| 678 |
-
print(f' [{status}] {ts_key}')
|
| 679 |
-
print(f' CRs: {n_crs} | Body changes applied: {n_ok} | Skipped: {n_skip}')
|
| 680 |
-
if out_path:
|
| 681 |
-
print(f' Output: {out_path.parent.name}/{out_path.name}')
|
| 682 |
-
if log_path and log_path.exists():
|
| 683 |
-
print(f' Log: {log_path.parent.name}/{log_path.name}')
|
| 684 |
-
for err in errors:
|
| 685 |
-
print(f' ! {err}')
|
| 686 |
|
| 687 |
print()
|
| 688 |
print(f'Output directory: {output_dir}/')
|
|
@@ -716,12 +602,7 @@ def main():
|
|
| 716 |
# Copy the CRs actually applied into the run output dir so the ZIP
|
| 717 |
# contains exactly the CRs used for this run (only needed when using
|
| 718 |
# a shared CR cache that lives outside output_dir).
|
| 719 |
-
|
| 720 |
-
if cr_dir.resolve() != _run_cr_dir.resolve():
|
| 721 |
-
_run_cr_dir.mkdir(parents=True, exist_ok=True)
|
| 722 |
-
for _p in cr_paths.values():
|
| 723 |
-
if _p.exists():
|
| 724 |
-
shutil.copy2(_p, _run_cr_dir / _p.name)
|
| 725 |
|
| 726 |
# ββ Final Report ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 727 |
_section('Final Report')
|
|
@@ -735,21 +616,7 @@ def main():
|
|
| 735 |
print(f'TSs updated: {n_success} fully OK, {n_partial} with warnings, {n_failed} failed')
|
| 736 |
print()
|
| 737 |
|
| 738 |
-
|
| 739 |
-
if out_path and not errors:
|
| 740 |
-
status = 'OK'
|
| 741 |
-
elif out_path:
|
| 742 |
-
status = 'WARN'
|
| 743 |
-
else:
|
| 744 |
-
status = 'FAIL'
|
| 745 |
-
print(f' [{status}] {ts_key}')
|
| 746 |
-
print(f' CRs: {n_crs} | Body changes applied: {n_ok} | Skipped: {n_skip}')
|
| 747 |
-
if out_path:
|
| 748 |
-
print(f' Output: {out_path.parent.name}/{out_path.name}')
|
| 749 |
-
if log_path and log_path.exists():
|
| 750 |
-
print(f' Log: {log_path.parent.name}/{log_path.name}')
|
| 751 |
-
for err in errors:
|
| 752 |
-
print(f' ! {err}')
|
| 753 |
|
| 754 |
print()
|
| 755 |
print(f'Output directory: {output_dir}/')
|
|
|
|
| 81 |
self._real.flush()
|
| 82 |
|
| 83 |
|
| 84 |
+
# ── Small report / cache helpers ──────────────────────────────────────────────
|
| 85 |
+
|
| 86 |
+
def _print_report(report, *, detailed=True):
    """Write one status entry per TS result to stdout.

    Args:
        report: Iterable of 7-tuples
            ``(ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors)``.
        detailed: When true, also print the CR/change counts, the output file
            and (if it exists on disk) the log file for every entry.

    The status tag is ``FAIL`` when there is no output path, ``WARN`` when
    there is an output path together with errors, and ``OK`` otherwise.
    Error messages are printed in both modes.
    """
    for entry in report:
        ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors = entry

        if not out_path:
            status = 'FAIL'
        elif errors:
            status = 'WARN'
        else:
            status = 'OK'
        print(f' [{status}] {ts_key}')

        if detailed:
            print(f' CRs: {n_crs} | Body changes applied: {n_ok} | Skipped: {n_skip}')
            if out_path:
                print(f' Output: {out_path.parent.name}/{out_path.name}')
            # Only mention the log when it was actually written to disk.
            if log_path and log_path.exists():
                print(f' Log: {log_path.parent.name}/{log_path.name}')

        for message in errors:
            print(f' ! {message}')
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def _copy_cr_cache_if_needed(cr_paths, cr_dir, output_dir):
    """Mirror the CR files of this run into ``output_dir/CRs``.

    Nothing happens when *cr_dir* already resolves to ``output_dir/CRs``.
    Otherwise the target directory is created and every path in
    ``cr_paths.values()`` that exists on disk is copied into it under its
    own file name.
    """
    target_dir = output_dir / 'CRs'
    if cr_dir.resolve() == target_dir.resolve():
        # The CRs were downloaded straight into the run directory.
        return

    target_dir.mkdir(parents=True, exist_ok=True)
    existing = (src for src in cr_paths.values() if src.exists())
    for src in existing:
        shutil.copy2(src, target_dir / src.name)
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
# ── Per-TS-group apply helper ─────────────────────────────────────────────────
|
| 112 |
+
|
| 113 |
+
def _apply_ts_group(spec_number, version, uids, ts_paths, cr_paths, spec_dir,
                    author, tc_date, log_label='Pipeline Log'):
    """Parse, apply, and finalise one TS group; return one report tuple.

    Args:
        spec_number: Spec number of the TS; spaces are removed to build file
            names.
        version: Current TS version string; the target version comes from
            ``derive_new_version(version)``.
        uids: CR identifiers belonging to this TS group.
        ts_paths: Mapping ``(spec_number, version) -> TS DOCX path``. A
            missing key means the TS is unavailable and the group is skipped.
        cr_paths: Mapping ``uid -> CR DOCX path``. UIDs without an entry are
            recorded as errors and skipped.
        spec_dir: Directory that receives the applied/final DOCX and the log.
        author: Passed through to ``apply_manifest`` and the metadata
            update helpers.
        tc_date: Passed through to ``apply_manifest`` (as ``date``) and the
            metadata update helpers.
        log_label: First line of the per-TS log header.

    Returns:
        ``(ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors)``.
        ``out_path`` is ``None`` when the TS is missing, nothing could be
        parsed, or ``apply_manifest`` raised. It is the intermediate
        ``*_applied.docx`` when finalisation raised, and the final DOCX
        otherwise.

    Everything printed after the TS-availability check is also captured and
    written to ``log_path``. No log file is written when the TS is missing.
    """
    group_key = (spec_number, version)
    ts_key = f'TS {spec_number} v{version}'
    spec_compact = spec_number.replace(' ', '')
    new_v = derive_new_version(version)
    stem = f'ts_{spec_compact}_v{new_v}_was_v{version}'
    ts_applied = spec_dir / f'ts_{spec_compact}_v{version}_applied.docx'
    ts_final = spec_dir / f'{stem}.docx'
    log_path = spec_dir / f'{stem}.log'
    errors = []

    print(f'\n-- {ts_key} ({len(uids)} CR(s): {", ".join(uids)}) --')

    if group_key not in ts_paths:
        msg = 'TS download failed — skipping'
        print(f' SKIP: {msg}')
        return (ts_key, 0, 0, len(uids), None, log_path, [msg])

    ts_in = ts_paths[group_key]

    # Mirror stdout into an in-memory buffer so the same text becomes the log.
    log_buf = io.StringIO()
    tee = _TeeWriter(sys.stdout, log_buf)

    with contextlib.redirect_stdout(tee):
        log_header = (
            f'{log_label}\n'
            f'TS: {spec_number} v{version} -> v{new_v}\n'
            f'CRs: {", ".join(uids)}\n'
            f'Date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n'
            f'{"=" * 60}\n'
        )
        print(log_header, end='')

        # ── Parse every available CR into one combined manifest ──────────
        combined_manifest = []
        participating_uids = []

        for uid in uids:
            if uid not in cr_paths:
                errors.append(f'[{uid}] CR download had failed — skipped')
                continue
            print(f' Parsing {uid}... ', end='', flush=True)
            try:
                changes = parse_cr(cr_paths[uid])
                combined_manifest.extend(changes)
                participating_uids.append(uid)
                print(f'{len(changes)} change(s)')
            except Exception as e:
                # Best effort: one unparsable CR must not abort the group.
                errors.append(f'[{uid}] parse ERROR: {e}')
                print(f'ERROR: {e}')

        if not combined_manifest:
            print(' No changes parsed — skipping apply step.')
            log_path.write_text(log_buf.getvalue(), encoding='utf-8')
            return (ts_key, 0, 0, len(uids), None, log_path,
                    errors + ['No changes parsed'])

        # ── Apply the manifest to the TS ─────────────────────────────────
        print(f' Applying {len(combined_manifest)} change(s) to {ts_in.name}...')
        try:
            n_ok, n_skip, log_lines = apply_manifest(
                ts_in, combined_manifest, ts_applied, author=author, date=tc_date
            )
        except Exception as e:
            errors.append(f'apply_manifest ERROR: {e}')
            print(f' ERROR: {e}')
            log_path.write_text(log_buf.getvalue(), encoding='utf-8')
            return (ts_key, 0, 0, len(uids), None, log_path, errors)

        for line in log_lines:
            print(f' {line}')
            # Surface applicator errors in the report as well as in the log.
            stripped = line.strip()
            if stripped.startswith('ERROR'):
                errors.append(stripped)
        print(f' -> Applied: {n_ok} Skipped: {n_skip}')

        # ── Finalise metadata (change history, history, title) ───────────
        print(' Finalising metadata...')
        ts_final_or_applied = ts_applied  # fallback if finalise raises
        try:
            ts_doc = docx_lib.Document(str(ts_applied))
            rev = RevCounter(ts_doc)

            pub_ym, pub_month_year = compute_pub_date()
            old_v = version

            title_text = ts_doc.paragraphs[0].text
            date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
            old_date_str = date_match.group(1) if date_match else ''

            print(f' Version: {old_v} -> {new_v}')
            print(f' Publication: {pub_month_year} ({pub_ym})')

            for uid in participating_uids:
                try:
                    meta = extract_cr_metadata(str(cr_paths[uid]))
                    ch_cells = update_change_history_table(
                        ts_doc, meta, pub_ym, old_v, new_v, rev, author, tc_date
                    )
                    print(f' [Change History] {uid}: {ch_cells}')
                except NoChangeHistoryTable:
                    # Not an error: reported in the log only.
                    print(f' [Change History] {uid}: NOT PRESENT — this document has no Change History table (History table only)')
                except Exception as e:
                    errors.append(f'[{uid}] Change History ERROR: {e}')
                    print(f' [Change History] {uid}: ERROR — {e}')

            try:
                h_cells = update_history_table(
                    ts_doc, new_v, pub_month_year, rev, author, tc_date
                )
                print(f' [History] {h_cells}')
            except Exception as e:
                errors.append(f'History table ERROR: {e}')
                print(f' [History] ERROR — {e}')

            if old_date_str:
                try:
                    update_title_para(
                        ts_doc, old_v, new_v, old_date_str, pub_ym, rev, author, tc_date
                    )
                    print(f' [Title] V{old_v} -> V{new_v}, ({old_date_str}) -> ({pub_ym})')
                except Exception as e:
                    errors.append(f'Title update ERROR: {e}')
                    print(f' [Title] ERROR — {e}')
            else:
                print(f' [Title] SKIP — no (YYYY-MM) pattern in: {title_text!r}')

            ts_doc.save(str(ts_final))
            print(f' Saved: {spec_compact}/{ts_final.name}')
            print(f' Log: {spec_compact}/{log_path.name}')
            ts_final_or_applied = ts_final

        except Exception as e:
            # Keep the applied-but-unfinalised DOCX as the reported output.
            errors.append(f'Finalisation ERROR: {e}')
            print(f' Finalisation ERROR: {e}')

    log_path.write_text(log_buf.getvalue(), encoding='utf-8')
    return (ts_key, n_ok, n_skip, len(uids), ts_final_or_applied, log_path, errors)
|
| 249 |
+
|
| 250 |
+
|
| 251 |
# ── Shared Steps 2, 4, 5, 6 ───────────────────────────────────────────────────
|
| 252 |
|
| 253 |
def _run_steps_2_to_6(cr_list, ts_groups, output_dir, cr_dir, ts_dir,
|
|
|
|
| 349 |
report = [] # (ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors)
|
| 350 |
|
| 351 |
for (spec_number, version), uids in ts_groups.items():
|
|
|
|
| 352 |
spec_compact = spec_number.replace(' ', '')
|
| 353 |
spec_dir = spec_dirs.get((spec_number, version), ts_dir / spec_compact)
|
| 354 |
spec_dir.mkdir(parents=True, exist_ok=True)
|
| 355 |
+
report.append(_apply_ts_group(
|
| 356 |
+
spec_number, version, uids, ts_paths, cr_paths, spec_dir, author, tc_date
|
| 357 |
+
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
|
| 359 |
return report, cr_paths, ts_paths, spec_dirs
|
| 360 |
|
|
|
|
| 485 |
report = []
|
| 486 |
|
| 487 |
for (spec_number, version), uids in ts_groups.items():
|
|
|
|
| 488 |
spec_compact = spec_number.replace(' ', '')
|
| 489 |
spec_dir = spec_dirs.get((spec_number, version), ts_dir / spec_compact)
|
| 490 |
spec_dir.mkdir(parents=True, exist_ok=True)
|
| 491 |
+
report.append(_apply_ts_group(
|
| 492 |
+
spec_number, version, uids, ts_paths, cr_paths, spec_dir, author, tc_date,
|
| 493 |
+
log_label='Pipeline Log (retry)'
|
| 494 |
+
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
|
| 496 |
# Update failed_ts.json — remove entries that are now resolved
|
| 497 |
still_failed = [
|
|
|
|
| 505 |
n_partial = sum(1 for r in report if r[4] is not None and r[6])
|
| 506 |
n_failed = sum(1 for r in report if r[4] is None)
|
| 507 |
print(f'TSs processed: {n_success} fully OK, {n_partial} with warnings, {n_failed} skipped/failed')
|
| 508 |
+
_print_report(report, detailed=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 509 |
return
|
| 510 |
|
| 511 |
# ββ TS mode β load HF index, skip Steps 1 & 3 ββββββββββββββββββββββββββββ
|
|
|
|
| 556 |
# Copy the CRs actually applied into the run output dir so the ZIP
|
| 557 |
# contains exactly the CRs used for this TS (only needed when using
|
| 558 |
# a shared CR cache that lives outside output_dir).
|
| 559 |
+
_copy_cr_cache_if_needed(cr_paths, cr_dir, output_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 560 |
|
| 561 |
_section('Final Report (TS mode)')
|
| 562 |
n_success = sum(1 for r in report if r[4] is not None and not r[6])
|
|
|
|
| 568 |
print(f'TSs updated: {n_success} fully OK, {n_partial} with warnings, {n_failed} failed')
|
| 569 |
print()
|
| 570 |
|
| 571 |
+
_print_report(report)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 572 |
|
| 573 |
print()
|
| 574 |
print(f'Output directory: {output_dir}/')
|
|
|
|
| 602 |
# Copy the CRs actually applied into the run output dir so the ZIP
|
| 603 |
# contains exactly the CRs used for this run (only needed when using
|
| 604 |
# a shared CR cache that lives outside output_dir).
|
| 605 |
+
_copy_cr_cache_if_needed(cr_paths, cr_dir, output_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 606 |
|
| 607 |
# ββ Final Report ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 608 |
_section('Final Report')
|
|
|
|
| 616 |
print(f'TSs updated: {n_success} fully OK, {n_partial} with warnings, {n_failed} failed')
|
| 617 |
print()
|
| 618 |
|
| 619 |
+
_print_report(report)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 620 |
|
| 621 |
print()
|
| 622 |
print(f'Output directory: {output_dir}/')
|
scripts/ts_applicator.py
CHANGED
|
@@ -21,6 +21,10 @@ from docx.oxml import OxmlElement
|
|
| 21 |
from docx.oxml.ns import qn
|
| 22 |
|
| 23 |
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
from docx_helpers import (
|
| 25 |
RevCounter,
|
| 26 |
tracked_modify_para,
|
|
@@ -32,24 +36,29 @@ from docx_helpers import (
|
|
| 32 |
|
| 33 |
# ── Text normalisation ────────────────────────────────────────────────────────
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
def _norm(text):
|
| 36 |
"""Normalise common Unicode invisible/whitespace/punctuation variants for comparison."""
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
.replace('\u2007', ' ') # figure space
|
| 41 |
-
.replace('\u2060', '') # word joiner (invisible)
|
| 42 |
-
.replace('\u200b', '') # zero-width space
|
| 43 |
-
.replace('\u00ad', '') # soft hyphen (invisible)
|
| 44 |
-
.replace('\u2011', '-') # non-breaking hyphen
|
| 45 |
-
.replace('\u2013', '-') # en dash
|
| 46 |
-
.replace('\u2014', '-') # em dash
|
| 47 |
-
.replace('\u2212', '-') # minus sign
|
| 48 |
-
.replace('\u2018', "'") # left single quote
|
| 49 |
-
.replace('\u2019', "'") # right single quote
|
| 50 |
-
.replace('\u201c', '"') # left double quote
|
| 51 |
-
.replace('\u201d', '"') # right double quote
|
| 52 |
-
.strip())
|
| 53 |
|
| 54 |
|
| 55 |
def _norm_ws(text):
|
|
@@ -70,22 +79,9 @@ def _norm_ws(text):
|
|
| 70 |
Removing all whitespace from both sides before comparing solves this.
|
| 71 |
Used as a third-level fallback (confidence 0.8) after exact and NBSP-norm.
|
| 72 |
"""
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
.replace('\u2007', '')
|
| 77 |
-
.replace('\u2060', '')
|
| 78 |
-
.replace('\u200b', '')
|
| 79 |
-
.replace('\u00ad', '')
|
| 80 |
-
.replace('\u2011', '-')
|
| 81 |
-
.replace('\u2013', '-')
|
| 82 |
-
.replace('\u2014', '-')
|
| 83 |
-
.replace('\u2212', '-')
|
| 84 |
-
.replace('\u2018', "'")
|
| 85 |
-
.replace('\u2019', "'")
|
| 86 |
-
.replace('\u201c', '"')
|
| 87 |
-
.replace('\u201d', '"'))
|
| 88 |
-
return re.sub(r'\s+', '', base)
|
| 89 |
|
| 90 |
|
| 91 |
def _norm_alnum(text):
|
|
@@ -225,14 +221,14 @@ def _find_table(doc, header_key):
|
|
| 225 |
for tbl in doc.tables:
|
| 226 |
if not tbl.rows:
|
| 227 |
continue
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
|
| 237 |
return None, 0.0
|
| 238 |
|
|
@@ -453,6 +449,20 @@ def _apply_section_replace(doc, change, rev, author, date, log):
|
|
| 453 |
break
|
| 454 |
else:
|
| 455 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 456 |
|
| 457 |
# ── Clone and remap IDs on the CR elements ─────────────────────────────────
|
| 458 |
cloned = []
|
|
@@ -522,7 +532,10 @@ def _apply_text_replace(doc, change, rev, author, date, log):
|
|
| 522 |
for para in cell.paragraphs:
|
| 523 |
if old in para.text:
|
| 524 |
tracked_modify_para(para, old, new, rev, author, date)
|
| 525 |
-
|
|
|
|
|
|
|
|
|
|
| 526 |
return True
|
| 527 |
log.append(f" ERROR text_replace: old text {old!r} not in cell (row={row_idx} col={col_idx})")
|
| 528 |
return False
|
|
@@ -546,7 +559,11 @@ def _apply_text_replace(doc, change, rev, author, date, log):
|
|
| 546 |
tracked_modify_para(para, old, new, rev, author, date)
|
| 547 |
log.append(f" OK text_replace (table_cell scan row={r_idx} col={col_idx}): {old!r} β {new!r}")
|
| 548 |
return True
|
| 549 |
-
# Final fallback: scan ALL columns of ALL tables
|
|
|
|
|
|
|
|
|
|
|
|
|
| 550 |
_all_start = tbl_by_section if tbl_by_section is not None else tbl
|
| 551 |
for search_tbl in [_all_start] + [t for t in doc.tables if t is not _all_start]:
|
| 552 |
for r_idx, row in enumerate(search_tbl.rows):
|
|
@@ -554,7 +571,9 @@ def _apply_text_replace(doc, change, rev, author, date, log):
|
|
| 554 |
for para in cell.paragraphs:
|
| 555 |
if old in para.text:
|
| 556 |
tracked_modify_para(para, old, new, rev, author, date)
|
| 557 |
-
log.append(f"
|
|
|
|
|
|
|
| 558 |
return True
|
| 559 |
log.append(f" ERROR text_replace: old text {old!r} not found in any table column")
|
| 560 |
return False
|
|
@@ -606,6 +625,7 @@ def _apply_row_insert(doc, change, rev, author, date, log, last_inserted=None):
|
|
| 606 |
tbl_by_section, _ = _find_table_by_section(doc, section_heading)
|
| 607 |
if tbl_by_section is not None:
|
| 608 |
tbl = tbl_by_section
|
|
|
|
| 609 |
else:
|
| 610 |
tbl, t_conf = _find_table(doc, loc['table_header'])
|
| 611 |
if tbl is None:
|
|
@@ -636,7 +656,9 @@ def _apply_row_insert(doc, change, rev, author, date, log, last_inserted=None):
|
|
| 636 |
last_inserted[key] = new_tr
|
| 637 |
|
| 638 |
desc = cells_data[1]['text'] if len(cells_data) > 1 else '?'
|
| 639 |
-
|
|
|
|
|
|
|
| 640 |
return True
|
| 641 |
|
| 642 |
|
|
|
|
| 21 |
from docx.oxml.ns import qn
|
| 22 |
|
| 23 |
sys.path.insert(0, str(Path(__file__).parent))
|
| 24 |
+
|
| 25 |
+
_MIN_LEN_ALLCOL_FALLBACK = 8 # old text shorter than this is too ambiguous for any-column search
|
| 26 |
+
_WARN_CONF = 0.8 # confidence below this emits WARN instead of OK
|
| 27 |
+
|
| 28 |
from docx_helpers import (
|
| 29 |
RevCounter,
|
| 30 |
tracked_modify_para,
|
|
|
|
| 36 |
|
| 37 |
# ── Text normalisation ────────────────────────────────────────────────────────
|
| 38 |
|
| 39 |
+
_UNICODE_REPLACEMENTS = (
|
| 40 |
+
('\xa0', ' '), # non-breaking space
|
| 41 |
+
('\u202f', ' '), # narrow no-break space
|
| 42 |
+
('\u2007', ' '), # figure space
|
| 43 |
+
('\u2060', ''), # word joiner (invisible)
|
| 44 |
+
('\u200b', ''), # zero-width space
|
| 45 |
+
('\u00ad', ''), # soft hyphen (invisible)
|
| 46 |
+
('\u2011', '-'), # non-breaking hyphen
|
| 47 |
+
('\u2013', '-'), # en dash
|
| 48 |
+
('\u2014', '-'), # em dash
|
| 49 |
+
('\u2212', '-'), # minus sign
|
| 50 |
+
('\u2018', "'"), # left single quote
|
| 51 |
+
('\u2019', "'"), # right single quote
|
| 52 |
+
('\u201c', '"'), # left double quote
|
| 53 |
+
('\u201d', '"'), # right double quote
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
def _norm(text):
|
| 58 |
"""Normalise common Unicode invisible/whitespace/punctuation variants for comparison."""
|
| 59 |
+
for old, new in _UNICODE_REPLACEMENTS:
|
| 60 |
+
text = text.replace(old, new)
|
| 61 |
+
return text.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
|
| 64 |
def _norm_ws(text):
|
|
|
|
| 79 |
Removing all whitespace from both sides before comparing solves this.
|
| 80 |
Used as a third-level fallback (confidence 0.8) after exact and NBSP-norm.
|
| 81 |
"""
|
| 82 |
+
for old, new in _UNICODE_REPLACEMENTS:
|
| 83 |
+
text = text.replace(old, new)
|
| 84 |
+
return re.sub(r'\s+', '', text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
|
| 87 |
def _norm_alnum(text):
|
|
|
|
| 221 |
for tbl in doc.tables:
|
| 222 |
if not tbl.rows:
|
| 223 |
continue
|
| 224 |
+
for row in tbl.rows[:3]: # check first 3 rows β header may not be row 0
|
| 225 |
+
row_texts = [_norm(c.text) for c in row.cells]
|
| 226 |
+
match = all(
|
| 227 |
+
i < len(row_texts) and norm_key[i] in row_texts[i]
|
| 228 |
+
for i in range(len(norm_key))
|
| 229 |
+
)
|
| 230 |
+
if match:
|
| 231 |
+
return tbl, 1.0
|
| 232 |
|
| 233 |
return None, 0.0
|
| 234 |
|
|
|
|
| 449 |
break
|
| 450 |
else:
|
| 451 |
break
|
| 452 |
+
# Validate the candidate table matches what the CR says should be deleted
|
| 453 |
+
if ts_tbl_elem is not None and elements_xml:
|
| 454 |
+
cr_tbl_xmls = [x for x in elements_xml if '<w:tbl' in x]
|
| 455 |
+
if cr_tbl_xmls:
|
| 456 |
+
from lxml import etree as _etree
|
| 457 |
+
cr_tbl_el = _etree.fromstring(cr_tbl_xmls[0].encode())
|
| 458 |
+
cr_hdr = ''.join(t.text or '' for t in
|
| 459 |
+
cr_tbl_el.findall('.//' + qn('w:t'))[:10]).lower()
|
| 460 |
+
ts_hdr = ''.join(t.text or '' for t in
|
| 461 |
+
ts_tbl_elem.findall('.//' + qn('w:t'))[:10]).lower()
|
| 462 |
+
if cr_hdr and cr_hdr not in ts_hdr and ts_hdr not in cr_hdr:
|
| 463 |
+
log.append(' WARN section_replace: candidate table header mismatch'
|
| 464 |
+
' β skipping table removal')
|
| 465 |
+
ts_tbl_elem = None
|
| 466 |
|
| 467 |
# ── Clone and remap IDs on the CR elements ─────────────────────────────────
|
| 468 |
cloned = []
|
|
|
|
| 532 |
for para in cell.paragraphs:
|
| 533 |
if old in para.text:
|
| 534 |
tracked_modify_para(para, old, new, rev, author, date)
|
| 535 |
+
_pfx = 'WARN' if min(t_conf, r_conf) < _WARN_CONF else 'OK '
|
| 536 |
+
log.append(f" {_pfx} text_replace (table_cell"
|
| 537 |
+
f" t_conf={t_conf:.1f} r_conf={r_conf:.1f}"
|
| 538 |
+
f" row={row_idx} col={col_idx}): {old!r} β {new!r}")
|
| 539 |
return True
|
| 540 |
log.append(f" ERROR text_replace: old text {old!r} not in cell (row={row_idx} col={col_idx})")
|
| 541 |
return False
|
|
|
|
| 559 |
tracked_modify_para(para, old, new, rev, author, date)
|
| 560 |
log.append(f" OK text_replace (table_cell scan row={r_idx} col={col_idx}): {old!r} β {new!r}")
|
| 561 |
return True
|
| 562 |
+
# Final fallback: scan ALL columns of ALL tables (guarded by min length)
|
| 563 |
+
if len(old) < _MIN_LEN_ALLCOL_FALLBACK:
|
| 564 |
+
log.append(f" ERROR text_replace: {old!r} too short for all-column fallback"
|
| 565 |
+
f" (ambiguous β skipped)")
|
| 566 |
+
return False
|
| 567 |
_all_start = tbl_by_section if tbl_by_section is not None else tbl
|
| 568 |
for search_tbl in [_all_start] + [t for t in doc.tables if t is not _all_start]:
|
| 569 |
for r_idx, row in enumerate(search_tbl.rows):
|
|
|
|
| 571 |
for para in cell.paragraphs:
|
| 572 |
if old in para.text:
|
| 573 |
tracked_modify_para(para, old, new, rev, author, date)
|
| 574 |
+
log.append(f" WARN text_replace (table_cell any_col"
|
| 575 |
+
f" row={r_idx} col={c_idx} β low confidence):"
|
| 576 |
+
f" {old!r} β {new!r}")
|
| 577 |
return True
|
| 578 |
log.append(f" ERROR text_replace: old text {old!r} not found in any table column")
|
| 579 |
return False
|
|
|
|
| 625 |
tbl_by_section, _ = _find_table_by_section(doc, section_heading)
|
| 626 |
if tbl_by_section is not None:
|
| 627 |
tbl = tbl_by_section
|
| 628 |
+
t_conf = 1.0
|
| 629 |
else:
|
| 630 |
tbl, t_conf = _find_table(doc, loc['table_header'])
|
| 631 |
if tbl is None:
|
|
|
|
| 656 |
last_inserted[key] = new_tr
|
| 657 |
|
| 658 |
desc = cells_data[1]['text'] if len(cells_data) > 1 else '?'
|
| 659 |
+
_pfx = 'WARN' if min(t_conf, r_conf) < _WARN_CONF else 'OK '
|
| 660 |
+
log.append(f" {_pfx} row_insert (t_conf={t_conf:.1f} r_conf={r_conf:.1f})"
|
| 661 |
+
f" after row[{row_idx}] ({after_anchor!r}): {desc!r}")
|
| 662 |
return True
|
| 663 |
|
| 664 |
|