Spaces:
Sleeping
Sleeping
fix minor errors
Browse files- scripts/cr_parser.py +63 -7
- scripts/etsi_client.py +24 -9
- scripts/fetch_crs.py +13 -4
- scripts/finalize_ts.py +20 -21
- scripts/orchestrate_cr.py +3 -2
- scripts/ts_applicator.py +317 -70
scripts/cr_parser.py
CHANGED
|
@@ -57,6 +57,46 @@ def _style_val(p_elem):
|
|
| 57 |
return None
|
| 58 |
return pStyle.get(qn('w:val'))
|
| 59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
def _is_rpr_ins(ins_elem):
|
| 61 |
"""True if w:ins is inside w:rPr β a formatting change, not a content insertion."""
|
| 62 |
p = ins_elem.getparent()
|
|
@@ -185,7 +225,7 @@ def _extract_inline_replacements(p_elem):
|
|
| 185 |
|
| 186 |
# ── Table change extraction ───────────────────────────────────────────────────
|
| 187 |
|
| 188 |
-
def _parse_table(tbl_elem, changes, section_heading=''):
|
| 189 |
header = _table_header(tbl_elem)
|
| 190 |
header_key = header[:3] # first 3 columns enough for matching
|
| 191 |
rows = tbl_elem.findall(qn('w:tr'))
|
|
@@ -195,14 +235,18 @@ def _parse_table(tbl_elem, changes, section_heading=''):
|
|
| 195 |
|
| 196 |
# ββ Tracked row insertion βββββββββββββββββββββββββββββββββββββββββ
|
| 197 |
if trPr is not None and trPr.find(qn('w:ins')) is not None:
|
| 198 |
-
# Find preceding stable
|
| 199 |
-
|
| 200 |
for prev_idx in range(tr_idx - 1, -1, -1):
|
| 201 |
prev_tr = rows[prev_idx]
|
| 202 |
prev_trPr = prev_tr.find(qn('w:trPr'))
|
| 203 |
if prev_trPr is None or prev_trPr.find(qn('w:ins')) is None:
|
| 204 |
-
|
| 205 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
cells = []
|
| 208 |
for tc in tr.findall(qn('w:tc')):
|
|
@@ -244,7 +288,9 @@ def _parse_table(tbl_elem, changes, section_heading=''):
|
|
| 244 |
'kind': 'table_row',
|
| 245 |
'table_header': header_key,
|
| 246 |
'after_row_anchor': after_anchor,
|
|
|
|
| 247 |
'section_heading': section_heading,
|
|
|
|
| 248 |
},
|
| 249 |
'cells': cells,
|
| 250 |
})
|
|
@@ -266,6 +312,7 @@ def _parse_table(tbl_elem, changes, section_heading=''):
|
|
| 266 |
'row_anchor': row_anchor,
|
| 267 |
'col_idx': col_idx,
|
| 268 |
'section_heading': section_heading,
|
|
|
|
| 269 |
},
|
| 270 |
'old': old_text,
|
| 271 |
'new': new_text,
|
|
@@ -292,6 +339,7 @@ def _parse_body(body, changes):
|
|
| 292 |
from lxml import etree
|
| 293 |
|
| 294 |
prev_stable_text = ''
|
|
|
|
| 295 |
|
| 296 |
# ββ Section-replace accumulator βββββββββββββββββββββββββββββββββββββββββββ
|
| 297 |
sec_del = [] # fully-deleted elements (CR del block)
|
|
@@ -340,6 +388,7 @@ def _parse_body(body, changes):
|
|
| 340 |
'del_heading': del_heading,
|
| 341 |
'has_del_table': has_del_table,
|
| 342 |
'anchor_text': sec_anchor,
|
|
|
|
| 343 |
},
|
| 344 |
'elements_xml': elements_xml,
|
| 345 |
})
|
|
@@ -371,6 +420,7 @@ def _parse_body(body, changes):
|
|
| 371 |
'location': {
|
| 372 |
'kind': 'body',
|
| 373 |
'anchor_text': prev_stable_text,
|
|
|
|
| 374 |
},
|
| 375 |
'paragraphs': paras,
|
| 376 |
})
|
|
@@ -380,6 +430,10 @@ def _parse_body(body, changes):
|
|
| 380 |
tag = elem.tag.split('}')[-1] if '}' in elem.tag else elem.tag
|
| 381 |
|
| 382 |
if tag == 'p':
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
is_del = _is_deleted_para(elem)
|
| 384 |
is_ins = _is_inserted_para(elem)
|
| 385 |
is_empty = not _para_orig_text(elem).strip() and not _para_new_text(elem).strip()
|
|
@@ -430,13 +484,14 @@ def _parse_body(body, changes):
|
|
| 430 |
'location': {
|
| 431 |
'kind': 'body_para',
|
| 432 |
'para_context': _para_orig_text(elem).strip(),
|
|
|
|
| 433 |
},
|
| 434 |
'old': old_text,
|
| 435 |
'new': new_text,
|
| 436 |
})
|
| 437 |
|
| 438 |
orig = _para_orig_text(elem).strip()
|
| 439 |
-
if orig and not
|
| 440 |
prev_stable_text = orig
|
| 441 |
|
| 442 |
elif tag == 'tbl':
|
|
@@ -464,7 +519,8 @@ def _parse_body(body, changes):
|
|
| 464 |
# Table with inline cell changes
|
| 465 |
flush_section()
|
| 466 |
flush_group()
|
| 467 |
-
_parse_table(elem, changes, section_heading=prev_stable_text
|
|
|
|
| 468 |
|
| 469 |
flush_section()
|
| 470 |
flush_group()
|
|
|
|
| 57 |
return None
|
| 58 |
return pStyle.get(qn('w:val'))
|
| 59 |
|
| 60 |
+
|
| 61 |
+
_HEADING_NUM_RE = re.compile(r'^(\d+(?:\.\d+)*)\s+\S')
|
| 62 |
+
_SKIP_MARKER_RE = re.compile(r'^[\[\(]?\s*(?:\.{3}|β¦)\s*[\]\)]?$')
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _para_text_with_tabs(p_elem):
|
| 66 |
+
"""Paragraph text with w:tab elements rendered as '\\t'.
|
| 67 |
+
Used for heading detection since ETSI headings store the number and title in
|
| 68 |
+
separate runs separated by <w:tab/>, which _para_orig_text would drop."""
|
| 69 |
+
parts = []
|
| 70 |
+
for node in p_elem.iter():
|
| 71 |
+
if node.tag == qn('w:t') and node.text:
|
| 72 |
+
if not any(a.tag == qn('w:ins') for a in node.iterancestors()):
|
| 73 |
+
parts.append(node.text)
|
| 74 |
+
elif node.tag == qn('w:delText') and node.text:
|
| 75 |
+
parts.append(node.text)
|
| 76 |
+
elif node.tag == qn('w:tab'):
|
| 77 |
+
parts.append('\t')
|
| 78 |
+
return ''.join(parts)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _heading_number(p_elem):
|
| 82 |
+
"""Return dotted section number if this paragraph is a numbered heading, else None.
|
| 83 |
+
Requires the paragraph style to start with 'Heading' (case-insensitive) — this
|
| 84 |
+
prevents false positives from body paragraphs whose text starts with a digit,
|
| 85 |
+
notably bit-description lines like "1 = alphabet set." (style B30) that appear
|
| 86 |
+
in Terminal Profile sections."""
|
| 87 |
+
style = (_style_val(p_elem) or '').lower()
|
| 88 |
+
if not style.startswith('heading'):
|
| 89 |
+
return None
|
| 90 |
+
text = _para_text_with_tabs(p_elem).strip()
|
| 91 |
+
m = _HEADING_NUM_RE.match(text)
|
| 92 |
+
return m.group(1) if m else None
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def _is_skip_marker(text):
|
| 96 |
+
"""True for [...] / [β¦] / ... / β¦ / (...) / (β¦) after .strip()."""
|
| 97 |
+
return bool(_SKIP_MARKER_RE.match(text.strip()))
|
| 98 |
+
|
| 99 |
+
|
| 100 |
def _is_rpr_ins(ins_elem):
|
| 101 |
"""True if w:ins is inside w:rPr β a formatting change, not a content insertion."""
|
| 102 |
p = ins_elem.getparent()
|
|
|
|
| 225 |
|
| 226 |
# ── Table change extraction ───────────────────────────────────────────────────
|
| 227 |
|
| 228 |
+
def _parse_table(tbl_elem, changes, section_heading='', section_number=''):
|
| 229 |
header = _table_header(tbl_elem)
|
| 230 |
header_key = header[:3] # first 3 columns enough for matching
|
| 231 |
rows = tbl_elem.findall(qn('w:tr'))
|
|
|
|
| 235 |
|
| 236 |
# ββ Tracked row insertion βββββββββββββββββββββββββββββββββββββββββ
|
| 237 |
if trPr is not None and trPr.find(qn('w:ins')) is not None:
|
| 238 |
+
# Find preceding stable rows for anchor + context disambiguation
|
| 239 |
+
stable_before = []
|
| 240 |
for prev_idx in range(tr_idx - 1, -1, -1):
|
| 241 |
prev_tr = rows[prev_idx]
|
| 242 |
prev_trPr = prev_tr.find(qn('w:trPr'))
|
| 243 |
if prev_trPr is None or prev_trPr.find(qn('w:ins')) is None:
|
| 244 |
+
stable_before.append(_row_col0(prev_tr))
|
| 245 |
+
if len(stable_before) >= 3:
|
| 246 |
+
break
|
| 247 |
+
|
| 248 |
+
after_anchor = stable_before[0] if stable_before else ''
|
| 249 |
+
context_rows_before = stable_before[1:]
|
| 250 |
|
| 251 |
cells = []
|
| 252 |
for tc in tr.findall(qn('w:tc')):
|
|
|
|
| 288 |
'kind': 'table_row',
|
| 289 |
'table_header': header_key,
|
| 290 |
'after_row_anchor': after_anchor,
|
| 291 |
+
'context_rows_before': context_rows_before,
|
| 292 |
'section_heading': section_heading,
|
| 293 |
+
'section_number': section_number,
|
| 294 |
},
|
| 295 |
'cells': cells,
|
| 296 |
})
|
|
|
|
| 312 |
'row_anchor': row_anchor,
|
| 313 |
'col_idx': col_idx,
|
| 314 |
'section_heading': section_heading,
|
| 315 |
+
'section_number': section_number,
|
| 316 |
},
|
| 317 |
'old': old_text,
|
| 318 |
'new': new_text,
|
|
|
|
| 339 |
from lxml import etree
|
| 340 |
|
| 341 |
prev_stable_text = ''
|
| 342 |
+
current_section = ''
|
| 343 |
|
| 344 |
# ββ Section-replace accumulator βββββββββββββββββββββββββββββββββββββββββββ
|
| 345 |
sec_del = [] # fully-deleted elements (CR del block)
|
|
|
|
| 388 |
'del_heading': del_heading,
|
| 389 |
'has_del_table': has_del_table,
|
| 390 |
'anchor_text': sec_anchor,
|
| 391 |
+
'section_number': current_section,
|
| 392 |
},
|
| 393 |
'elements_xml': elements_xml,
|
| 394 |
})
|
|
|
|
| 420 |
'location': {
|
| 421 |
'kind': 'body',
|
| 422 |
'anchor_text': prev_stable_text,
|
| 423 |
+
'section_number': current_section,
|
| 424 |
},
|
| 425 |
'paragraphs': paras,
|
| 426 |
})
|
|
|
|
| 430 |
tag = elem.tag.split('}')[-1] if '}' in elem.tag else elem.tag
|
| 431 |
|
| 432 |
if tag == 'p':
|
| 433 |
+
hn = _heading_number(elem)
|
| 434 |
+
if hn:
|
| 435 |
+
current_section = hn
|
| 436 |
+
|
| 437 |
is_del = _is_deleted_para(elem)
|
| 438 |
is_ins = _is_inserted_para(elem)
|
| 439 |
is_empty = not _para_orig_text(elem).strip() and not _para_new_text(elem).strip()
|
|
|
|
| 484 |
'location': {
|
| 485 |
'kind': 'body_para',
|
| 486 |
'para_context': _para_orig_text(elem).strip(),
|
| 487 |
+
'section_number': current_section,
|
| 488 |
},
|
| 489 |
'old': old_text,
|
| 490 |
'new': new_text,
|
| 491 |
})
|
| 492 |
|
| 493 |
orig = _para_orig_text(elem).strip()
|
| 494 |
+
if orig and not _is_skip_marker(orig):
|
| 495 |
prev_stable_text = orig
|
| 496 |
|
| 497 |
elif tag == 'tbl':
|
|
|
|
| 519 |
# Table with inline cell changes
|
| 520 |
flush_section()
|
| 521 |
flush_group()
|
| 522 |
+
_parse_table(elem, changes, section_heading=prev_stable_text,
|
| 523 |
+
section_number=current_section)
|
| 524 |
|
| 525 |
flush_section()
|
| 526 |
flush_group()
|
scripts/etsi_client.py
CHANGED
|
@@ -2,8 +2,8 @@
|
|
| 2 |
etsi_client.py — ETSI document download helpers for ApplyCRs.
|
| 3 |
|
| 4 |
Provides:
|
| 5 |
-
|
| 6 |
-
|
| 7 |
"""
|
| 8 |
|
| 9 |
import json
|
|
@@ -27,7 +27,7 @@ def _get_proxies() -> dict:
|
|
| 27 |
return {"http": proxy, "https": proxy}
|
| 28 |
|
| 29 |
|
| 30 |
-
class
|
| 31 |
HEADERS = {
|
| 32 |
"User-Agent": (
|
| 33 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
@@ -156,7 +156,7 @@ class ETSIDocFinder:
|
|
| 156 |
)
|
| 157 |
|
| 158 |
|
| 159 |
-
class
|
| 160 |
def __init__(self, eol_user: str, eol_password: str):
|
| 161 |
self.eol_user = eol_user
|
| 162 |
self.eol_password = eol_password
|
|
@@ -458,11 +458,26 @@ class ETSISpecFinder:
|
|
| 458 |
|
| 459 |
if not versioned_urls:
|
| 460 |
found_names = [u.split("/")[-1] for u in matching_urls]
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 466 |
|
| 467 |
matching_urls = versioned_urls
|
| 468 |
|
|
|
|
| 2 |
etsi_client.py — ETSI document download helpers for ApplyCRs.
|
| 3 |
|
| 4 |
Provides:
|
| 5 |
+
CRFetcher — CR TDoc downloads via docbox.etsi.org
|
| 6 |
+
TSFetcher — TS DOCX downloads via portal.etsi.org WKI chain
|
| 7 |
"""
|
| 8 |
|
| 9 |
import json
|
|
|
|
| 27 |
return {"http": proxy, "https": proxy}
|
| 28 |
|
| 29 |
|
| 30 |
+
class CRFetcher:
|
| 31 |
HEADERS = {
|
| 32 |
"User-Agent": (
|
| 33 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
|
|
| 156 |
)
|
| 157 |
|
| 158 |
|
| 159 |
+
class TSFetcher:
|
| 160 |
def __init__(self, eol_user: str, eol_password: str):
|
| 161 |
self.eol_user = eol_user
|
| 162 |
self.eol_password = eol_password
|
|
|
|
| 458 |
|
| 459 |
if not versioned_urls:
|
| 460 |
found_names = [u.split("/")[-1] for u in matching_urls]
|
| 461 |
+
# Decode the available version from the first filename (e.g. v160500 → 16.5.0)
|
| 462 |
+
avail_ver = None
|
| 463 |
+
if found_names:
|
| 464 |
+
m = re.search(r'v(\d{6})p?', found_names[0])
|
| 465 |
+
if m:
|
| 466 |
+
t = m.group(1)
|
| 467 |
+
avail_ver = f"{int(t[0:2])}.{int(t[2:4])}.{int(t[4:6])}"
|
| 468 |
+
if avail_ver:
|
| 469 |
+
print(
|
| 470 |
+
f"\n *** WARNING ***\n"
|
| 471 |
+
f" TS {doc_id} v{version_str} is not available on the ETSI portal.\n"
|
| 472 |
+
f" Portal has v{avail_ver} (file: {found_names[0]}).\n"
|
| 473 |
+
f" Options: target v{avail_ver} in your CR, or drop the TS DOCX manually.\n"
|
| 474 |
+
)
|
| 475 |
+
else:
|
| 476 |
+
print(
|
| 477 |
+
f" wki_id={wki_id}: version tag not in filenames {found_names}, "
|
| 478 |
+
f"rejecting (wrong version would be downloaded)"
|
| 479 |
+
)
|
| 480 |
+
return None
|
| 481 |
|
| 482 |
matching_urls = versioned_urls
|
| 483 |
|
scripts/fetch_crs.py
CHANGED
|
@@ -20,7 +20,7 @@ import sys
|
|
| 20 |
import zipfile
|
| 21 |
from pathlib import Path
|
| 22 |
|
| 23 |
-
from etsi_client import
|
| 24 |
|
| 25 |
|
| 26 |
# ---------------------------------------------------------------------------
|
|
@@ -218,10 +218,13 @@ def download_cr(uid: str, cr_dir: Path, eol_user: str, eol_password: str):
|
|
| 218 |
dest = cr_dir / f"{uid}.docx"
|
| 219 |
|
| 220 |
if dest.exists():
|
|
|
|
|
|
|
|
|
|
| 221 |
return dest, "already existed"
|
| 222 |
|
| 223 |
try:
|
| 224 |
-
finder =
|
| 225 |
url = finder.search_document(uid)
|
| 226 |
if isinstance(url, str) and "not found" in url.lower():
|
| 227 |
return None, f"document not found: {uid}"
|
|
@@ -373,7 +376,7 @@ def download_ts(spec_number: str, version: str, ts_dir: Path,
|
|
| 373 |
return filename, "already existed"
|
| 374 |
|
| 375 |
try:
|
| 376 |
-
finder =
|
| 377 |
tmp_path = finder.search_document_docx(spec_number, version)
|
| 378 |
except Exception as e:
|
| 379 |
return None, f"download error: {e}"
|
|
@@ -391,7 +394,7 @@ def download_ts(spec_number: str, version: str, ts_dir: Path,
|
|
| 391 |
dest.unlink()
|
| 392 |
return None, f"invalid file (not a ZIP/DOCX, starts with {content[:4]!r})"
|
| 393 |
|
| 394 |
-
# Verify the TS contains the expected spec number in its first paragraph
|
| 395 |
try:
|
| 396 |
import docx as _docx
|
| 397 |
_doc = _docx.Document(dest)
|
|
@@ -402,6 +405,12 @@ def download_ts(spec_number: str, version: str, ts_dir: Path,
|
|
| 402 |
f"wrong TS returned: got {first_para[:80]!r} "
|
| 403 |
f"(expected spec {spec_no_space})"
|
| 404 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 405 |
except Exception:
|
| 406 |
pass # Trust the ZIP check above
|
| 407 |
|
|
|
|
| 20 |
import zipfile
|
| 21 |
from pathlib import Path
|
| 22 |
|
| 23 |
+
from etsi_client import CRFetcher, TSFetcher
|
| 24 |
|
| 25 |
|
| 26 |
# ---------------------------------------------------------------------------
|
|
|
|
| 218 |
dest = cr_dir / f"{uid}.docx"
|
| 219 |
|
| 220 |
if dest.exists():
|
| 221 |
+
extracted = cr_dir / f"{uid}_extracted.docx"
|
| 222 |
+
if extracted.exists():
|
| 223 |
+
return extracted, "already existed"
|
| 224 |
return dest, "already existed"
|
| 225 |
|
| 226 |
try:
|
| 227 |
+
finder = CRFetcher(eol_user, eol_password)
|
| 228 |
url = finder.search_document(uid)
|
| 229 |
if isinstance(url, str) and "not found" in url.lower():
|
| 230 |
return None, f"document not found: {uid}"
|
|
|
|
| 376 |
return filename, "already existed"
|
| 377 |
|
| 378 |
try:
|
| 379 |
+
finder = TSFetcher(eol_user, eol_password)
|
| 380 |
tmp_path = finder.search_document_docx(spec_number, version)
|
| 381 |
except Exception as e:
|
| 382 |
return None, f"download error: {e}"
|
|
|
|
| 394 |
dest.unlink()
|
| 395 |
return None, f"invalid file (not a ZIP/DOCX, starts with {content[:4]!r})"
|
| 396 |
|
| 397 |
+
# Verify the TS contains the expected spec number AND version in its first paragraph
|
| 398 |
try:
|
| 399 |
import docx as _docx
|
| 400 |
_doc = _docx.Document(dest)
|
|
|
|
| 405 |
f"wrong TS returned: got {first_para[:80]!r} "
|
| 406 |
f"(expected spec {spec_no_space})"
|
| 407 |
)
|
| 408 |
+
if f"V{version}" not in first_para:
|
| 409 |
+
dest.unlink()
|
| 410 |
+
return None, (
|
| 411 |
+
f"wrong version returned: got {first_para[:80]!r} "
|
| 412 |
+
f"(expected V{version})"
|
| 413 |
+
)
|
| 414 |
except Exception:
|
| 415 |
pass # Trust the ZIP check above
|
| 416 |
|
scripts/finalize_ts.py
CHANGED
|
@@ -171,11 +171,12 @@ def _detect_meeting_separator(tbl):
|
|
| 171 |
Returns the detected separator character, defaulting to '#'.
|
| 172 |
"""
|
| 173 |
meet_col = 1 # default: standard ETSI Change History has Meeting in col 1
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
|
|
|
| 179 |
for row in reversed(tbl.rows):
|
| 180 |
cells = row.cells
|
| 181 |
if len(cells) > meet_col:
|
|
@@ -196,25 +197,21 @@ class NoChangeHistoryTable(Exception):
|
|
| 196 |
|
| 197 |
def find_change_history_table(ts_doc):
|
| 198 |
"""
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
Raises NoChangeHistoryTable
|
| 204 |
-
can distinguish a structural absence from an unexpected error.
|
| 205 |
"""
|
| 206 |
-
for tbl in
|
| 207 |
-
|
| 208 |
-
if ncols not in (8, 9):
|
| 209 |
continue
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
if {'cr', 'date'}.issubset(header_words):
|
| 214 |
-
return tbl
|
| 215 |
raise NoChangeHistoryTable(
|
| 216 |
'No Change History table found in this document '
|
| 217 |
-
'(no table
|
| 218 |
)
|
| 219 |
|
| 220 |
|
|
@@ -233,7 +230,9 @@ def find_history_table(ts_doc):
|
|
| 233 |
|
| 234 |
def update_change_history_table(ts_doc, meta, pub_yyyy_mm, old_v, new_v, rev, author, date_str):
|
| 235 |
tbl = find_change_history_table(ts_doc)
|
| 236 |
-
|
|
|
|
|
|
|
| 237 |
|
| 238 |
# Detect separator used in existing rows (e.g. '#' in 'SET#115', '-' in 'SET-119')
|
| 239 |
# and reformat meeting_id accordingly so it matches the existing style.
|
|
|
|
| 171 |
Returns the detected separator character, defaulting to '#'.
|
| 172 |
"""
|
| 173 |
meet_col = 1 # default: standard ETSI Change History has Meeting in col 1
|
| 174 |
+
# row[0] is the "Change history" title; row[1] is the column header row
|
| 175 |
+
header_row = tbl.rows[1] if len(tbl.rows) > 1 else tbl.rows[0]
|
| 176 |
+
for c_idx, cell in enumerate(header_row.cells):
|
| 177 |
+
if any(kw in cell.text.lower() for kw in ('meeting', 'body', 'tsg')):
|
| 178 |
+
meet_col = c_idx
|
| 179 |
+
break
|
| 180 |
for row in reversed(tbl.rows):
|
| 181 |
cells = row.cells
|
| 182 |
if len(cells) > meet_col:
|
|
|
|
| 197 |
|
| 198 |
def find_change_history_table(ts_doc):
|
| 199 |
"""
|
| 200 |
+
Find the Change History table by looking for a first row whose text
|
| 201 |
+
contains "Change history" (the merged title cell that ETSI places at the
|
| 202 |
+
top of the annex table).
|
| 203 |
+
|
| 204 |
+
Raises NoChangeHistoryTable when no such table is found.
|
|
|
|
| 205 |
"""
|
| 206 |
+
for tbl in ts_doc.tables:
|
| 207 |
+
if not tbl.rows:
|
|
|
|
| 208 |
continue
|
| 209 |
+
r0_text = ' '.join(c.text.strip() for c in tbl.rows[0].cells)
|
| 210 |
+
if 'Change history' in r0_text:
|
| 211 |
+
return tbl
|
|
|
|
|
|
|
| 212 |
raise NoChangeHistoryTable(
|
| 213 |
'No Change History table found in this document '
|
| 214 |
+
'(no table whose first row contains "Change history")'
|
| 215 |
)
|
| 216 |
|
| 217 |
|
|
|
|
| 230 |
|
| 231 |
def update_change_history_table(ts_doc, meta, pub_yyyy_mm, old_v, new_v, rev, author, date_str):
|
| 232 |
tbl = find_change_history_table(ts_doc)
|
| 233 |
+
# row[0] is the "Change history" title (merged); row[1] is the column header row
|
| 234 |
+
header_row = tbl.rows[1] if len(tbl.rows) > 1 else tbl.rows[0]
|
| 235 |
+
ncols = len(header_row.cells)
|
| 236 |
|
| 237 |
# Detect separator used in existing rows (e.g. '#' in 'SET#115', '-' in 'SET-119')
|
| 238 |
# and reformat meeting_id accordingly so it matches the existing style.
|
scripts/orchestrate_cr.py
CHANGED
|
@@ -169,7 +169,7 @@ def _apply_ts_group(spec_number, version, uids, ts_paths, cr_paths, spec_dir,
|
|
| 169 |
|
| 170 |
print(f' Applying {len(combined_manifest)} change(s) to {ts_in.name}...')
|
| 171 |
try:
|
| 172 |
-
n_ok, n_skip, log_lines = apply_manifest(
|
| 173 |
ts_in, combined_manifest, ts_applied, author=author, date=tc_date
|
| 174 |
)
|
| 175 |
except Exception as e:
|
|
@@ -183,7 +183,8 @@ def _apply_ts_group(spec_number, version, uids, ts_paths, cr_paths, spec_dir,
|
|
| 183 |
for line in log_lines:
|
| 184 |
if line.strip().startswith('ERROR'):
|
| 185 |
errors.append(line.strip())
|
| 186 |
-
print(f'
|
|
|
|
| 187 |
|
| 188 |
print(' Finalising metadata...')
|
| 189 |
ts_final_or_applied = ts_applied # fallback if finalise raises
|
|
|
|
| 169 |
|
| 170 |
print(f' Applying {len(combined_manifest)} change(s) to {ts_in.name}...')
|
| 171 |
try:
|
| 172 |
+
n_ok, n_skip, log_lines, n_parsed, n_merged = apply_manifest(
|
| 173 |
ts_in, combined_manifest, ts_applied, author=author, date=tc_date
|
| 174 |
)
|
| 175 |
except Exception as e:
|
|
|
|
| 183 |
for line in log_lines:
|
| 184 |
if line.strip().startswith('ERROR'):
|
| 185 |
errors.append(line.strip())
|
| 186 |
+
print(f' Parsed: {n_parsed} body changes (merged to {n_merged} groups)'
|
| 187 |
+
f' β Applied: {n_ok} Skipped: {n_skip}')
|
| 188 |
|
| 189 |
print(' Finalising metadata...')
|
| 190 |
ts_final_or_applied = ts_applied # fallback if finalise raises
|
scripts/ts_applicator.py
CHANGED
|
@@ -51,6 +51,7 @@ _UNICODE_REPLACEMENTS = (
|
|
| 51 |
('\u2019', "'"), # right single quote
|
| 52 |
('\u201c', '"'), # left double quote
|
| 53 |
('\u201d', '"'), # right double quote
|
|
|
|
| 54 |
)
|
| 55 |
|
| 56 |
|
|
@@ -123,26 +124,50 @@ def _full_para_text(para):
|
|
| 123 |
''.join(t.text or '' for t in el.findall('.//' + qn('w:delText')))
|
| 124 |
|
| 125 |
|
| 126 |
-
def
|
| 127 |
-
"""
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
|
|
|
| 137 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
norm_search = _norm(search_text)
|
| 139 |
ws_search = _norm_ws(search_text)
|
| 140 |
candidates_exact = []
|
| 141 |
candidates_norm = []
|
| 142 |
candidates_ws = []
|
|
|
|
| 143 |
candidates_del = []
|
| 144 |
|
| 145 |
-
for para in
|
| 146 |
pt = para.text
|
| 147 |
if search_text in pt:
|
| 148 |
candidates_exact.append(para)
|
|
@@ -151,28 +176,157 @@ def _find_para(doc, search_text, prefer_not_in_table=False):
|
|
| 151 |
elif ws_search and ws_search in _norm_ws(pt):
|
| 152 |
candidates_ws.append(para)
|
| 153 |
else:
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
elif ws_search and ws_search in _norm_ws(
|
| 159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
def _in_table(para):
|
| 162 |
p = para._element
|
| 163 |
return any(a.tag == qn('w:tc') for a in p.iterancestors())
|
| 164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
for pool, conf in [(candidates_exact, 1.0), (candidates_norm, 0.9),
|
| 166 |
-
(candidates_ws, 0.8), (
|
|
|
|
| 167 |
if not pool:
|
| 168 |
continue
|
| 169 |
-
if
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
|
|
|
| 174 |
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
|
| 178 |
def _find_table_by_section(doc, section_heading):
|
|
@@ -233,11 +387,33 @@ def _find_table(doc, header_key):
|
|
| 233 |
return None, 0.0
|
| 234 |
|
| 235 |
|
| 236 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
"""
|
| 238 |
Find first row in tbl where col-0 cell text contains anchor_text.
|
| 239 |
Returns (row_idx, confidence) or (-1, 0.0).
|
| 240 |
|
|
|
|
|
|
|
|
|
|
| 241 |
Matching levels, in order of confidence:
|
| 242 |
1.0 — exact substring match
|
| 243 |
0.9 — Unicode-normalised match (_norm: xa0, dashes, quotes, …)
|
|
@@ -248,38 +424,33 @@ def _find_row(tbl, anchor_text):
|
|
| 248 |
0.5 — clean-prefix + token-overlap: when multiple rows share the prefix,
|
| 249 |
pick the one whose col-0 tokens overlap most with the anchor tokens.
|
| 250 |
"""
|
|
|
|
| 251 |
norm_anchor = _norm(anchor_text)
|
| 252 |
ws_anchor = _norm_ws(anchor_text)
|
| 253 |
alnum_anchor = _norm_alnum(anchor_text)
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
|
|
|
| 269 |
|
| 270 |
# ββ Prefix-based partial match βββββββββββββββββββββββββββββββββββββββββββββ
|
| 271 |
-
# The anchor may have Unicode chars embedded mid-text that prevent all string
|
| 272 |
-
# comparisons above from matching, even after normalisation (e.g. when the CR
|
| 273 |
-
# extracts '\xa0' between spec-number parts but the TS has different encoding).
|
| 274 |
-
# Strategy: use only the clean ASCII prefix of the anchor as the search key.
|
| 275 |
-
# If that prefix is found in exactly one row β we've uniquely identified it.
|
| 276 |
-
# If it appears in several rows β pick the one whose full token set overlaps
|
| 277 |
-
# most with the anchor's tokens (the user's described disambiguation rule).
|
| 278 |
prefix = _clean_prefix(anchor_text)
|
| 279 |
if prefix and len(prefix) > 8:
|
| 280 |
prefix_low = prefix.lower()
|
| 281 |
hits = [
|
| 282 |
-
idx for idx, row in enumerate(
|
| 283 |
if row.cells and prefix_low in row.cells[0].text.lower()
|
| 284 |
]
|
| 285 |
if len(hits) == 1:
|
|
@@ -289,7 +460,7 @@ def _find_row(tbl, anchor_text):
|
|
| 289 |
best_score, best_idx = -1, -1
|
| 290 |
for hit_idx in hits:
|
| 291 |
cell_tokens = set(re.findall(r'[a-z0-9]+',
|
| 292 |
-
|
| 293 |
score = len(anchor_tokens & cell_tokens)
|
| 294 |
if score > best_score:
|
| 295 |
best_score, best_idx = score, hit_idx
|
|
@@ -401,29 +572,77 @@ def _apply_section_replace(doc, change, rev, author, date, log):
|
|
| 401 |
loc = change['location']
|
| 402 |
del_heading = loc.get('del_heading', '')
|
| 403 |
has_del_table = loc.get('has_del_table', False)
|
|
|
|
| 404 |
elements_xml = change.get('elements_xml', [])
|
| 405 |
|
| 406 |
if not elements_xml:
|
| 407 |
log.append(' SKIP section_replace: no elements in manifest')
|
| 408 |
return False
|
| 409 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
# ββ Find the TS paragraph that matches the deleted heading βββββββββββββββββ
|
| 411 |
ts_para_elem = None
|
|
|
|
| 412 |
if del_heading:
|
| 413 |
-
for para in
|
| 414 |
pt = para.text
|
| 415 |
if del_heading in pt or _norm(del_heading) in _norm(pt):
|
| 416 |
ts_para_elem = para._element
|
| 417 |
break
|
| 418 |
if ts_para_elem is None:
|
| 419 |
# Fallback: include paragraphs whose XML text (inc. del runs) matches
|
| 420 |
-
for para in
|
| 421 |
if del_heading in _full_para_text(para):
|
| 422 |
ts_para_elem = para._element
|
| 423 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 424 |
|
| 425 |
if ts_para_elem is None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 426 |
log.append(f' ERROR section_replace: del_heading {del_heading!r} not found in TS')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 427 |
return False
|
| 428 |
|
| 429 |
ts_body = ts_para_elem.getparent()
|
|
@@ -475,13 +694,16 @@ def _apply_section_replace(doc, change, rev, author, date, log):
|
|
| 475 |
el.set(qn('w:id'), rev.next())
|
| 476 |
cloned.append(cloned_elem)
|
| 477 |
|
| 478 |
-
# ββ Insert cloned elements before
|
| 479 |
insert_idx = list(ts_body).index(ts_para_elem)
|
|
|
|
|
|
|
| 480 |
for i, elem in enumerate(cloned):
|
| 481 |
ts_body.insert(insert_idx + i, elem)
|
| 482 |
|
| 483 |
-
# ββ Remove the now-replaced TS elements ββ
|
| 484 |
-
|
|
|
|
| 485 |
if ts_tbl_elem is not None:
|
| 486 |
ts_body.remove(ts_tbl_elem)
|
| 487 |
|
|
@@ -580,14 +802,24 @@ def _apply_text_replace(doc, change, rev, author, date, log):
|
|
| 580 |
|
| 581 |
elif loc['kind'] == 'body_para':
|
| 582 |
ctx = loc.get('para_context', '')
|
| 583 |
-
|
| 584 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 585 |
if para is None:
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
if para is None:
|
| 589 |
-
log.append(f" ERROR text_replace: old text {old!r} not found in TS")
|
| 590 |
-
return False
|
| 591 |
if old in para.text:
|
| 592 |
tracked_modify_para(para, old, new, rev, author, date)
|
| 593 |
log.append(f" OK text_replace (body_para conf={conf:.1f}): {old!r} β {new!r}")
|
|
@@ -600,14 +832,26 @@ def _apply_text_replace(doc, change, rev, author, date, log):
|
|
| 600 |
|
| 601 |
|
| 602 |
def _apply_para_insert(doc, change, rev, author, date, log):
|
| 603 |
-
|
|
|
|
|
|
|
| 604 |
paras_data = change.get('paragraphs', [])
|
| 605 |
if not paras_data:
|
| 606 |
return True
|
| 607 |
|
| 608 |
-
anchor_para, conf =
|
|
|
|
| 609 |
if anchor_para is None:
|
| 610 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 611 |
return False
|
| 612 |
|
| 613 |
items = [(p['text'], p['style'] or 'Normal') for p in paras_data]
|
|
@@ -633,7 +877,8 @@ def _apply_row_insert(doc, change, rev, author, date, log, last_inserted=None):
|
|
| 633 |
return False
|
| 634 |
|
| 635 |
after_anchor = loc.get('after_row_anchor', '')
|
| 636 |
-
|
|
|
|
| 637 |
if row_idx < 0:
|
| 638 |
log.append(f" ERROR row_insert: anchor row not found {after_anchor!r}")
|
| 639 |
return False
|
|
@@ -697,7 +942,7 @@ def _merge_para_inserts(manifest):
|
|
| 697 |
def apply_manifest(ts_path, manifest, out_path, author=DEFAULT_AUTHOR, date=DEFAULT_DATE):
|
| 698 |
"""
|
| 699 |
Apply all changes in manifest to ts_path, save to out_path.
|
| 700 |
-
Returns (n_ok, n_skipped, log_lines).
|
| 701 |
"""
|
| 702 |
doc = docx.Document(str(ts_path))
|
| 703 |
rev = RevCounter(doc)
|
|
@@ -705,7 +950,9 @@ def apply_manifest(ts_path, manifest, out_path, author=DEFAULT_AUTHOR, date=DEFA
|
|
| 705 |
n_ok = 0
|
| 706 |
n_skip = 0
|
| 707 |
|
|
|
|
| 708 |
manifest = _merge_para_inserts(manifest)
|
|
|
|
| 709 |
|
| 710 |
# Track last inserted <w:tr> per (tbl_id, anchor_row_idx) to maintain
|
| 711 |
# forward insertion order when multiple row_inserts target the same anchor.
|
|
@@ -732,7 +979,7 @@ def apply_manifest(ts_path, manifest, out_path, author=DEFAULT_AUTHOR, date=DEFA
|
|
| 732 |
n_skip += 1
|
| 733 |
|
| 734 |
doc.save(str(out_path))
|
| 735 |
-
return n_ok, n_skip, log
|
| 736 |
|
| 737 |
|
| 738 |
# ββ CLI βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -752,11 +999,11 @@ def main():
|
|
| 752 |
manifest = json.load(f)
|
| 753 |
|
| 754 |
print(f'Applying {len(manifest)} change(s) from manifest to {ts_path.name}...')
|
| 755 |
-
n_ok, n_skip, log = apply_manifest(ts_path, manifest, out_path, author=args.author)
|
| 756 |
|
| 757 |
for line in log:
|
| 758 |
print(line)
|
| 759 |
-
print(f'\
|
| 760 |
print(f'Output: {out_path}')
|
| 761 |
|
| 762 |
|
|
|
|
| 51 |
('\u2019', "'"), # right single quote
|
| 52 |
('\u201c', '"'), # left double quote
|
| 53 |
('\u201d', '"'), # right double quote
|
| 54 |
+
('\u2026', '...'), # horizontal ellipsis → three dots
|
| 55 |
)
|
| 56 |
|
| 57 |
|
|
|
|
| 124 |
''.join(t.text or '' for t in el.findall('.//' + qn('w:delText')))
|
| 125 |
|
| 126 |
|
| 127 |
+
def _original_para_text(para):
    """Rebuild the text a paragraph had before its tracked changes.

    Walks the paragraph XML in document order and concatenates:
      - every w:delText (text that a tracked deletion removed), and
      - every w:t that has no w:ins ancestor below the paragraph element.
    w:t nodes sitting under a w:ins (tracked insertions) are left out.

    This lets an anchor written against the original wording (e.g.
    'SCP81Connection') still match after a tracked '1'→'X' replacement has
    been applied to the same paragraph.
    """
    root = para._element
    text_tag = qn('w:t')
    deleted_tag = qn('w:delText')
    inserted_tag = qn('w:ins')

    def _under_insertion(text_node):
        # Climb towards the paragraph; anything above the paragraph element
        # itself is irrelevant, so stop as soon as it is reached.
        for ancestor in text_node.iterancestors():
            if ancestor is root:
                return False
            if ancestor.tag == inserted_tag:
                return True
        return False

    pieces = []
    for node in root.iter():
        tag = node.tag
        if tag == deleted_tag or (tag == text_tag and not _under_insertion(node)):
            pieces.append(node.text or '')
    return ''.join(pieces)
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def _match_paragraphs(paragraphs, search_text, prefer_not_in_table=False):
|
| 160 |
+
"""Core 5-tier matching logic. Operates on any iterable of Paragraph objects.
|
| 161 |
+
Returns (para, confidence) or (None, 0.0)."""
|
| 162 |
norm_search = _norm(search_text)
|
| 163 |
ws_search = _norm_ws(search_text)
|
| 164 |
candidates_exact = []
|
| 165 |
candidates_norm = []
|
| 166 |
candidates_ws = []
|
| 167 |
+
candidates_orig = []
|
| 168 |
candidates_del = []
|
| 169 |
|
| 170 |
+
for para in paragraphs:
|
| 171 |
pt = para.text
|
| 172 |
if search_text in pt:
|
| 173 |
candidates_exact.append(para)
|
|
|
|
| 176 |
elif ws_search and ws_search in _norm_ws(pt):
|
| 177 |
candidates_ws.append(para)
|
| 178 |
else:
|
| 179 |
+
orig_pt = _original_para_text(para)
|
| 180 |
+
if (search_text in orig_pt
|
| 181 |
+
or (norm_search and norm_search in _norm(orig_pt))):
|
| 182 |
+
candidates_orig.append(para)
|
| 183 |
+
elif ws_search and ws_search in _norm_ws(orig_pt):
|
| 184 |
+
candidates_orig.append(para)
|
| 185 |
+
else:
|
| 186 |
+
full_pt = _full_para_text(para)
|
| 187 |
+
if search_text in full_pt:
|
| 188 |
+
candidates_del.append(para)
|
| 189 |
+
elif ws_search and ws_search in _norm_ws(full_pt):
|
| 190 |
+
candidates_del.append(para)
|
| 191 |
|
| 192 |
def _in_table(para):
|
| 193 |
p = para._element
|
| 194 |
return any(a.tag == qn('w:tc') for a in p.iterancestors())
|
| 195 |
|
| 196 |
+
if not prefer_not_in_table:
|
| 197 |
+
for pool, conf in [(candidates_exact, 1.0), (candidates_norm, 0.9),
|
| 198 |
+
(candidates_ws, 0.8), (candidates_orig, 0.7),
|
| 199 |
+
(candidates_del, 0.6)]:
|
| 200 |
+
if pool:
|
| 201 |
+
return pool[0], conf
|
| 202 |
+
return None, 0.0
|
| 203 |
+
|
| 204 |
+
best_table_match = (None, 0.0)
|
| 205 |
for pool, conf in [(candidates_exact, 1.0), (candidates_norm, 0.9),
|
| 206 |
+
(candidates_ws, 0.8), (candidates_orig, 0.7),
|
| 207 |
+
(candidates_del, 0.6)]:
|
| 208 |
if not pool:
|
| 209 |
continue
|
| 210 |
+
body_only = [p for p in pool if not _in_table(p)]
|
| 211 |
+
if body_only:
|
| 212 |
+
return body_only[0], conf
|
| 213 |
+
if best_table_match[0] is None:
|
| 214 |
+
best_table_match = (pool[0], conf)
|
| 215 |
+
return best_table_match if best_table_match[0] is not None else (None, 0.0)
|
| 216 |
|
| 217 |
+
|
| 218 |
+
def _find_para(doc, search_text, prefer_not_in_table=False):
    """Document-wide anchor lookup.

    Hands every paragraph in ``doc.paragraphs`` to ``_match_paragraphs`` and
    returns its ``(para, confidence)`` result unchanged.
    """
    every_paragraph = doc.paragraphs
    return _match_paragraphs(every_paragraph, search_text, prefer_not_in_table)
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
# ── Section-aware anchor search ───────────────────────────────────────────────
|
| 225 |
+
|
| 226 |
+
# Dotted section number ("5", "5.1.2") at the start of the text, followed by
# whitespace and at least one non-space character.
_HEADING_NUM_RE = re.compile(r'^(\d+(?:\.\d+)*)\s+\S')


def _para_heading_number(para):
    """Return the dotted section number of a heading paragraph, else None.

    A paragraph only counts as a heading when its style name starts with
    'heading' (case-insensitive). Numbered lines carrying any other style
    (TOC entries, front-matter lines, change-history labels, ...) are
    therefore rejected. The number is read from the stripped paragraph text.
    """
    style = para.style
    name = style.name if style is not None else ''
    if not (name or '').lower().startswith('heading'):
        return None
    hit = _HEADING_NUM_RE.match(para.text.strip())
    if hit is None:
        return None
    return hit.group(1)
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
def _is_descendant_section(child, parent):
    """Tell whether section number `child` is `parent` itself or lies below it.

    "Below" is decided purely on the dotted prefix: '5.1.2' is under '5.1',
    while '5.10' is not.
    """
    if child == parent:
        return True
    nested_prefix = parent + '.'
    return child.startswith(nested_prefix)
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def _section_range(doc, target):
    """Locate the slice of ``doc.paragraphs`` that belongs to section `target`.

    Returns ``(start, end)`` where
      start = index of the first heading whose number equals `target`;
      end   = index of the next heading whose number is neither `target` nor
              nested under it, or ``len(doc.paragraphs)`` when none follows.
    Returns ``(None, None)`` when no heading carries `target`.
    Nothing is cached: the paragraph list is scanned on every call.
    """
    paragraphs = doc.paragraphs
    first = None
    for pos, paragraph in enumerate(paragraphs):
        number = _para_heading_number(paragraph)
        if number is None:
            continue  # not a numbered heading, so it cannot open or close a section
        if first is None:
            # Still looking for the opening heading.
            if number == target:
                first = pos
            continue
        if not _is_descendant_section(number, target):
            return (first, pos)
    if first is None:
        return (None, None)
    return (first, len(paragraphs))
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def _enclosing_heading(doc, para):
    """Find the numbered heading that `para` sits under.

    `para` is located in ``doc.paragraphs`` by XML-element identity, then the
    list is scanned backwards from that position (the paragraph itself
    included) for the first paragraph that ``_para_heading_number`` accepts.
    Returns that heading Paragraph, or None when `para` is not in the list or
    no heading precedes it. Used to build HINT log lines.
    """
    every_paragraph = doc.paragraphs
    wanted = para._element
    position = next(
        (i for i, candidate in enumerate(every_paragraph)
         if candidate._element is wanted),
        None,
    )
    if position is None:
        return None
    for candidate in reversed(every_paragraph[:position + 1]):
        if _para_heading_number(candidate) is not None:
            return candidate
    return None
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def _find_para_in_section(doc, search_text, section_number, prefer_not_in_table=False):
    """Anchor lookup limited to the paragraphs of one section.

    Returns ``(para, conf, status)``:
      * status 'no_section' — `section_number` is empty or has no heading in
        the document; para is None and conf is 0.0. The caller is expected to
        fall back to the global ``_find_para`` and log a WARN line.
      * status 'in_section' — the section exists; para/conf are whatever
        ``_match_paragraphs`` found inside it (para may still be None).
    """
    if section_number:
        first, stop = _section_range(doc, section_number)
    else:
        first, stop = None, None
    if first is None:
        return (None, 0.0, 'no_section')
    scoped = doc.paragraphs[first:stop]
    hit, confidence = _match_paragraphs(scoped, search_text, prefer_not_in_table)
    return (hit, confidence, 'in_section')
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
def _find_para_with_section(doc, search_text, section_number, kind_label, log,
                            prefer_not_in_table=False):
    """Section-aware anchor search with WARN/ERROR logging.

    Args:
        doc: document to search.
        search_text: anchor text to locate.
        section_number: dotted section number declared on the change; falsy
            when the change carries none.
        kind_label: change kind, used as the prefix of ERROR lines.
        log: list of str; WARN / ERROR / HINT lines are appended to it.
        prefer_not_in_table: forwarded unchanged to the matching helpers.

    Returns:
        ``(para, confidence)``; ``(None, 0.0)`` when the declared section
        exists but does not contain the anchor.

    Behaviour:
      * section_number present + found in TS + anchor in range → that match.
      * section_number present + not in TS → WARN, fall back to global _find_para.
      * section_number present + anchor NOT in range → ERROR (+ HINT when the
        anchor exists elsewhere), return (None, 0.0).
      * section_number missing → WARN, fall back to global _find_para.
    """
    if not section_number:
        log.append(f" WARN no section_number on change — global anchor search for {search_text[:60]!r}")
        return _find_para(doc, search_text, prefer_not_in_table)

    para, conf, status = _find_para_in_section(
        doc, search_text, section_number, prefer_not_in_table)
    if status == 'in_section' and para is not None:
        return para, conf
    if status == 'no_section':
        log.append(f" WARN section '{section_number}' not found in TS — falling back to global search")
        return _find_para(doc, search_text, prefer_not_in_table)

    # The section exists but the anchor is not inside it. Look globally only
    # to tell the user where the anchor really lives; the match is NOT used.
    g_para, _ = _find_para(doc, search_text, prefer_not_in_table)
    if g_para is not None:
        enc = _enclosing_heading(doc, g_para)
        actual = _para_heading_number(enc) if enc is not None else '?'
        log.append(f" ERROR {kind_label}: anchor {search_text[:60]!r} declared in section "
                   f"{section_number} but found in section {actual}")
        log.append(f" HINT nearest match: {g_para.text[:120]!r}")
    else:
        log.append(f" ERROR {kind_label}: anchor {search_text[:60]!r} not found in section "
                   f"{section_number} (or anywhere)")
    return None, 0.0
|
| 330 |
|
| 331 |
|
| 332 |
def _find_table_by_section(doc, section_heading):
|
|
|
|
| 387 |
return None, 0.0
|
| 388 |
|
| 389 |
|
| 390 |
+
def _disambiguate_by_context(all_rows, candidates, context_rows_before):
    """Pick the candidate row whose preceding rows best match the expected context.

    Args:
        all_rows: list of table rows; each row exposes ``.cells`` and each
            cell exposes ``.text``.
        candidates: non-empty list of row indices into `all_rows`.
        context_rows_before: expected col-0 texts of the rows preceding the
            anchor, closest-first (entry 0 is the row directly above).

    Returns:
        The candidate index with the highest number of matching context rows.
        On a tie the earliest candidate in `candidates` wins.

    An expected text that is empty, or that normalises to an empty string,
    is ignored: an empty needle is a substring of every cell and would
    otherwise add a point to every candidate that has a row at that depth.
    """
    # Normalise each expected text once instead of once per candidate.
    expectations = []
    for depth, expected in enumerate(context_rows_before, start=1):
        if not expected:
            continue
        norm_expected = _norm(expected)
        ws_expected = _norm_ws(expected)
        if norm_expected or ws_expected:
            expectations.append((depth, norm_expected, ws_expected))

    # Candidates often share context rows; read each row's col-0 text once.
    cell0_cache = {}

    def _cell0_text(row_idx):
        if row_idx not in cell0_cache:
            cells = all_rows[row_idx].cells
            cell0_cache[row_idx] = cells[0].text if cells else ''
        return cell0_cache[row_idx]

    best_score, best_idx = -1, candidates[0]
    for idx in candidates:
        score = 0
        for depth, norm_expected, ws_expected in expectations:
            ctx_idx = idx - depth
            if ctx_idx < 0:
                continue  # candidate has no row that far above it
            cell0 = _cell0_text(ctx_idx)
            if ((norm_expected and norm_expected in _norm(cell0))
                    or (ws_expected and ws_expected in _norm_ws(cell0))):
                score += 1
        if score > best_score:  # strict '>' keeps the earliest candidate on ties
            best_score, best_idx = score, idx
    return best_idx
|
| 407 |
+
|
| 408 |
+
|
| 409 |
+
def _find_row(tbl, anchor_text, context_rows_before=None):
|
| 410 |
"""
|
| 411 |
Find first row in tbl where col-0 cell text contains anchor_text.
|
| 412 |
Returns (row_idx, confidence) or (-1, 0.0).
|
| 413 |
|
| 414 |
+
When context_rows_before is provided and multiple rows match, uses the
|
| 415 |
+
col-0 texts of the rows preceding each candidate to disambiguate.
|
| 416 |
+
|
| 417 |
Matching levels, in order of confidence:
|
| 418 |
1.0 β exact substring match
|
| 419 |
0.9 β Unicode-normalised match (_norm: xa0, dashes, quotes, β¦)
|
|
|
|
| 424 |
0.5 β clean-prefix + token-overlap: when multiple rows share the prefix,
|
| 425 |
pick the one whose col-0 tokens overlap most with the anchor tokens.
|
| 426 |
"""
|
| 427 |
+
all_rows = list(tbl.rows)
|
| 428 |
norm_anchor = _norm(anchor_text)
|
| 429 |
ws_anchor = _norm_ws(anchor_text)
|
| 430 |
alnum_anchor = _norm_alnum(anchor_text)
|
| 431 |
+
|
| 432 |
+
for match_fn, conf in [
|
| 433 |
+
(lambda c: anchor_text in c, 1.0),
|
| 434 |
+
(lambda c: bool(norm_anchor) and norm_anchor in _norm(c), 0.9),
|
| 435 |
+
(lambda c: bool(ws_anchor) and ws_anchor in _norm_ws(c), 0.8),
|
| 436 |
+
(lambda c: bool(alnum_anchor) and alnum_anchor in _norm_alnum(c), 0.6),
|
| 437 |
+
]:
|
| 438 |
+
candidates = [
|
| 439 |
+
idx for idx, row in enumerate(all_rows)
|
| 440 |
+
if row.cells and match_fn(row.cells[0].text)
|
| 441 |
+
]
|
| 442 |
+
if not candidates:
|
| 443 |
+
continue
|
| 444 |
+
if len(candidates) == 1 or not context_rows_before:
|
| 445 |
+
return candidates[0], conf
|
| 446 |
+
return _disambiguate_by_context(all_rows, candidates, context_rows_before), conf
|
| 447 |
|
| 448 |
# ββ Prefix-based partial match βββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 449 |
prefix = _clean_prefix(anchor_text)
|
| 450 |
if prefix and len(prefix) > 8:
|
| 451 |
prefix_low = prefix.lower()
|
| 452 |
hits = [
|
| 453 |
+
idx for idx, row in enumerate(all_rows)
|
| 454 |
if row.cells and prefix_low in row.cells[0].text.lower()
|
| 455 |
]
|
| 456 |
if len(hits) == 1:
|
|
|
|
| 460 |
best_score, best_idx = -1, -1
|
| 461 |
for hit_idx in hits:
|
| 462 |
cell_tokens = set(re.findall(r'[a-z0-9]+',
|
| 463 |
+
all_rows[hit_idx].cells[0].text.lower()))
|
| 464 |
score = len(anchor_tokens & cell_tokens)
|
| 465 |
if score > best_score:
|
| 466 |
best_score, best_idx = score, hit_idx
|
|
|
|
| 572 |
loc = change['location']
|
| 573 |
del_heading = loc.get('del_heading', '')
|
| 574 |
has_del_table = loc.get('has_del_table', False)
|
| 575 |
+
section_number = loc.get('section_number', '')
|
| 576 |
elements_xml = change.get('elements_xml', [])
|
| 577 |
|
| 578 |
if not elements_xml:
|
| 579 |
log.append(' SKIP section_replace: no elements in manifest')
|
| 580 |
return False
|
| 581 |
|
| 582 |
+
# ββ Resolve search scope: restrict to declared section if possible βββββββββ
|
| 583 |
+
search_paras = doc.paragraphs
|
| 584 |
+
section_status = 'no_section_required'
|
| 585 |
+
if section_number:
|
| 586 |
+
start, end = _section_range(doc, section_number)
|
| 587 |
+
if start is not None:
|
| 588 |
+
search_paras = doc.paragraphs[start:end]
|
| 589 |
+
section_status = 'in_section'
|
| 590 |
+
else:
|
| 591 |
+
log.append(f" WARN section '{section_number}' not found in TS β falling back to global search")
|
| 592 |
+
section_status = 'section_not_in_ts'
|
| 593 |
+
else:
|
| 594 |
+
log.append(" WARN no section_number on section_replace β global search")
|
| 595 |
+
|
| 596 |
# ββ Find the TS paragraph that matches the deleted heading βββββββββββββββββ
|
| 597 |
ts_para_elem = None
|
| 598 |
+
insert_after_anchor = False # when True: insert after anchor, don't delete it
|
| 599 |
if del_heading:
|
| 600 |
+
for para in search_paras:
|
| 601 |
pt = para.text
|
| 602 |
if del_heading in pt or _norm(del_heading) in _norm(pt):
|
| 603 |
ts_para_elem = para._element
|
| 604 |
break
|
| 605 |
if ts_para_elem is None:
|
| 606 |
# Fallback: include paragraphs whose XML text (inc. del runs) matches
|
| 607 |
+
for para in search_paras:
|
| 608 |
if del_heading in _full_para_text(para):
|
| 609 |
ts_para_elem = para._element
|
| 610 |
break
|
| 611 |
+
else:
|
| 612 |
+
# No heading to delete β use anchor_text to find insertion point
|
| 613 |
+
anchor_text = loc.get('anchor_text', '')
|
| 614 |
+
if anchor_text:
|
| 615 |
+
if section_status == 'in_section':
|
| 616 |
+
anchor_para, _, _ = _find_para_in_section(
|
| 617 |
+
doc, anchor_text, section_number)
|
| 618 |
+
else:
|
| 619 |
+
anchor_para, _ = _find_para(doc, anchor_text)
|
| 620 |
+
if anchor_para is not None:
|
| 621 |
+
ts_para_elem = anchor_para._element
|
| 622 |
+
insert_after_anchor = True
|
| 623 |
|
| 624 |
if ts_para_elem is None:
|
| 625 |
+
# Section mismatch check: if declared section exists, but del_heading
|
| 626 |
+
# is found GLOBALLY in a different section, report that.
|
| 627 |
+
if section_status == 'in_section' and del_heading:
|
| 628 |
+
for para in doc.paragraphs:
|
| 629 |
+
pt = para.text
|
| 630 |
+
if del_heading in pt or del_heading in _full_para_text(para):
|
| 631 |
+
enc = _enclosing_heading(doc, para)
|
| 632 |
+
actual = _para_heading_number(enc) if enc is not None else '?'
|
| 633 |
+
log.append(f' ERROR section_replace: del_heading {del_heading!r} declared in section '
|
| 634 |
+
f'{section_number} but found in section {actual}')
|
| 635 |
+
log.append(f" HINT nearest match: {para.text[:120]!r}")
|
| 636 |
+
return False
|
| 637 |
log.append(f' ERROR section_replace: del_heading {del_heading!r} not found in TS')
|
| 638 |
+
tokens = del_heading.split()[:3] if del_heading else []
|
| 639 |
+
if tokens:
|
| 640 |
+
_hints = sorted(
|
| 641 |
+
[p for p in doc.paragraphs if any(tok in p.text for tok in tokens)],
|
| 642 |
+
key=lambda p: -len(set(del_heading.split()) & set(p.text.split()))
|
| 643 |
+
)[:3]
|
| 644 |
+
for _h in _hints:
|
| 645 |
+
log.append(f" HINT nearest match: {_h.text[:120]!r}")
|
| 646 |
return False
|
| 647 |
|
| 648 |
ts_body = ts_para_elem.getparent()
|
|
|
|
| 694 |
el.set(qn('w:id'), rev.next())
|
| 695 |
cloned.append(cloned_elem)
|
| 696 |
|
| 697 |
+
# ββ Insert cloned elements before (or after) the anchor paragraph βββββββββ
|
| 698 |
insert_idx = list(ts_body).index(ts_para_elem)
|
| 699 |
+
if insert_after_anchor:
|
| 700 |
+
insert_idx += 1 # insert after anchor, not before it
|
| 701 |
for i, elem in enumerate(cloned):
|
| 702 |
ts_body.insert(insert_idx + i, elem)
|
| 703 |
|
| 704 |
+
# ββ Remove the now-replaced TS elements (only when a heading was deleted) ββ
|
| 705 |
+
if not insert_after_anchor:
|
| 706 |
+
ts_body.remove(ts_para_elem)
|
| 707 |
if ts_tbl_elem is not None:
|
| 708 |
ts_body.remove(ts_tbl_elem)
|
| 709 |
|
|
|
|
| 802 |
|
| 803 |
elif loc['kind'] == 'body_para':
|
| 804 |
ctx = loc.get('para_context', '')
|
| 805 |
+
section_number = loc.get('section_number', '')
|
| 806 |
+
if len(old) < 4 and ctx:
|
| 807 |
+
# Short old text matches too broadly (e.g. a single digit would hit
|
| 808 |
+
# the title paragraph). Locate by context first, then verify old
|
| 809 |
+
# text is present in that paragraph.
|
| 810 |
+
para, conf = _find_para_with_section(
|
| 811 |
+
doc, ctx, section_number, 'text_replace', log, prefer_not_in_table=True)
|
| 812 |
+
if para is None or old not in para.text:
|
| 813 |
+
para = None
|
| 814 |
+
else:
|
| 815 |
+
para, conf = _find_para_with_section(
|
| 816 |
+
doc, old, section_number, 'text_replace', log, prefer_not_in_table=True)
|
| 817 |
+
if para is None and ctx:
|
| 818 |
+
para, conf = _find_para_with_section(
|
| 819 |
+
doc, ctx, section_number, 'text_replace', log, prefer_not_in_table=True)
|
| 820 |
if para is None:
|
| 821 |
+
log.append(f" ERROR text_replace: old text {old!r} not found in TS")
|
| 822 |
+
return False
|
|
|
|
|
|
|
|
|
|
| 823 |
if old in para.text:
|
| 824 |
tracked_modify_para(para, old, new, rev, author, date)
|
| 825 |
log.append(f" OK text_replace (body_para conf={conf:.1f}): {old!r} β {new!r}")
|
|
|
|
| 832 |
|
| 833 |
|
| 834 |
def _apply_para_insert(doc, change, rev, author, date, log):
|
| 835 |
+
loc = change['location']
|
| 836 |
+
anchor_text = loc.get('anchor_text', '')
|
| 837 |
+
section_number = loc.get('section_number', '')
|
| 838 |
paras_data = change.get('paragraphs', [])
|
| 839 |
if not paras_data:
|
| 840 |
return True
|
| 841 |
|
| 842 |
+
anchor_para, conf = _find_para_with_section(
|
| 843 |
+
doc, anchor_text, section_number, 'para_insert', log)
|
| 844 |
if anchor_para is None:
|
| 845 |
+
# When no section_number context, emit the legacy ERROR + HINT lines
|
| 846 |
+
if not section_number:
|
| 847 |
+
log.append(f" ERROR para_insert: anchor not found {anchor_text[:60]!r}")
|
| 848 |
+
tokens = anchor_text.split()[:3]
|
| 849 |
+
_hints = sorted(
|
| 850 |
+
[p for p in doc.paragraphs if any(tok in p.text for tok in tokens)],
|
| 851 |
+
key=lambda p: -len(set(anchor_text.split()) & set(p.text.split()))
|
| 852 |
+
)[:3]
|
| 853 |
+
for _h in _hints:
|
| 854 |
+
log.append(f" HINT nearest match: {_h.text[:120]!r}")
|
| 855 |
return False
|
| 856 |
|
| 857 |
items = [(p['text'], p['style'] or 'Normal') for p in paras_data]
|
|
|
|
| 877 |
return False
|
| 878 |
|
| 879 |
after_anchor = loc.get('after_row_anchor', '')
|
| 880 |
+
context_rows_before = loc.get('context_rows_before', [])
|
| 881 |
+
row_idx, r_conf = _find_row(tbl, after_anchor, context_rows_before)
|
| 882 |
if row_idx < 0:
|
| 883 |
log.append(f" ERROR row_insert: anchor row not found {after_anchor!r}")
|
| 884 |
return False
|
|
|
|
| 942 |
def apply_manifest(ts_path, manifest, out_path, author=DEFAULT_AUTHOR, date=DEFAULT_DATE):
|
| 943 |
"""
|
| 944 |
Apply all changes in manifest to ts_path, save to out_path.
|
| 945 |
+
Returns (n_ok, n_skipped, log_lines, n_parsed, n_merged_groups).
|
| 946 |
"""
|
| 947 |
doc = docx.Document(str(ts_path))
|
| 948 |
rev = RevCounter(doc)
|
|
|
|
| 950 |
n_ok = 0
|
| 951 |
n_skip = 0
|
| 952 |
|
| 953 |
+
n_parsed = len(manifest)
|
| 954 |
manifest = _merge_para_inserts(manifest)
|
| 955 |
+
n_merged = len(manifest)
|
| 956 |
|
| 957 |
# Track last inserted <w:tr> per (tbl_id, anchor_row_idx) to maintain
|
| 958 |
# forward insertion order when multiple row_inserts target the same anchor.
|
|
|
|
| 979 |
n_skip += 1
|
| 980 |
|
| 981 |
doc.save(str(out_path))
|
| 982 |
+
return n_ok, n_skip, log, n_parsed, n_merged
|
| 983 |
|
| 984 |
|
| 985 |
# ── CLI ───────────────────────────────────────────────────────────────────────
|
|
|
|
| 999 |
manifest = json.load(f)
|
| 1000 |
|
| 1001 |
print(f'Applying {len(manifest)} change(s) from manifest to {ts_path.name}...')
|
| 1002 |
+
n_ok, n_skip, log, n_parsed, n_merged = apply_manifest(ts_path, manifest, out_path, author=args.author)
|
| 1003 |
|
| 1004 |
for line in log:
|
| 1005 |
print(line)
|
| 1006 |
+
print(f'\nParsed: {n_parsed} body changes (merged to {n_merged} groups) β Applied: {n_ok} Skipped: {n_skip}')
|
| 1007 |
print(f'Output: {out_path}')
|
| 1008 |
|
| 1009 |
|