heymenn committed on
Commit
2b96123
·
1 Parent(s): b5fc740

fix minor errors

Browse files
scripts/cr_parser.py CHANGED
@@ -57,6 +57,46 @@ def _style_val(p_elem):
57
  return None
58
  return pStyle.get(qn('w:val'))
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  def _is_rpr_ins(ins_elem):
61
  """True if w:ins is inside w:rPr — a formatting change, not a content insertion."""
62
  p = ins_elem.getparent()
@@ -185,7 +225,7 @@ def _extract_inline_replacements(p_elem):
185
 
186
  # ── Table change extraction ───────────────────────────────────────────────────
187
 
188
- def _parse_table(tbl_elem, changes, section_heading=''):
189
  header = _table_header(tbl_elem)
190
  header_key = header[:3] # first 3 columns enough for matching
191
  rows = tbl_elem.findall(qn('w:tr'))
@@ -195,14 +235,18 @@ def _parse_table(tbl_elem, changes, section_heading=''):
195
 
196
  # ── Tracked row insertion ─────────────────────────────────────────
197
  if trPr is not None and trPr.find(qn('w:ins')) is not None:
198
- # Find preceding stable row for anchor
199
- after_anchor = ''
200
  for prev_idx in range(tr_idx - 1, -1, -1):
201
  prev_tr = rows[prev_idx]
202
  prev_trPr = prev_tr.find(qn('w:trPr'))
203
  if prev_trPr is None or prev_trPr.find(qn('w:ins')) is None:
204
- after_anchor = _row_col0(prev_tr)
205
- break
 
 
 
 
206
 
207
  cells = []
208
  for tc in tr.findall(qn('w:tc')):
@@ -244,7 +288,9 @@ def _parse_table(tbl_elem, changes, section_heading=''):
244
  'kind': 'table_row',
245
  'table_header': header_key,
246
  'after_row_anchor': after_anchor,
 
247
  'section_heading': section_heading,
 
248
  },
249
  'cells': cells,
250
  })
@@ -266,6 +312,7 @@ def _parse_table(tbl_elem, changes, section_heading=''):
266
  'row_anchor': row_anchor,
267
  'col_idx': col_idx,
268
  'section_heading': section_heading,
 
269
  },
270
  'old': old_text,
271
  'new': new_text,
@@ -292,6 +339,7 @@ def _parse_body(body, changes):
292
  from lxml import etree
293
 
294
  prev_stable_text = ''
 
295
 
296
  # ── Section-replace accumulator ───────────────────────────────────────────
297
  sec_del = [] # fully-deleted elements (CR del block)
@@ -340,6 +388,7 @@ def _parse_body(body, changes):
340
  'del_heading': del_heading,
341
  'has_del_table': has_del_table,
342
  'anchor_text': sec_anchor,
 
343
  },
344
  'elements_xml': elements_xml,
345
  })
@@ -371,6 +420,7 @@ def _parse_body(body, changes):
371
  'location': {
372
  'kind': 'body',
373
  'anchor_text': prev_stable_text,
 
374
  },
375
  'paragraphs': paras,
376
  })
@@ -380,6 +430,10 @@ def _parse_body(body, changes):
380
  tag = elem.tag.split('}')[-1] if '}' in elem.tag else elem.tag
381
 
382
  if tag == 'p':
 
 
 
 
383
  is_del = _is_deleted_para(elem)
384
  is_ins = _is_inserted_para(elem)
385
  is_empty = not _para_orig_text(elem).strip() and not _para_new_text(elem).strip()
@@ -430,13 +484,14 @@ def _parse_body(body, changes):
430
  'location': {
431
  'kind': 'body_para',
432
  'para_context': _para_orig_text(elem).strip(),
 
433
  },
434
  'old': old_text,
435
  'new': new_text,
436
  })
437
 
438
  orig = _para_orig_text(elem).strip()
439
- if orig and not re.fullmatch(r'\[\.[\s\.]*\]', orig):
440
  prev_stable_text = orig
441
 
442
  elif tag == 'tbl':
@@ -464,7 +519,8 @@ def _parse_body(body, changes):
464
  # Table with inline cell changes
465
  flush_section()
466
  flush_group()
467
- _parse_table(elem, changes, section_heading=prev_stable_text)
 
468
 
469
  flush_section()
470
  flush_group()
 
57
  return None
58
  return pStyle.get(qn('w:val'))
59
 
60
+
61
+ _HEADING_NUM_RE = re.compile(r'^(\d+(?:\.\d+)*)\s+\S')
62
+ _SKIP_MARKER_RE = re.compile(r'^[\[\(]?\s*(?:\.{3}|…)\s*[\]\)]?$')
63
+
64
+
65
+ def _para_text_with_tabs(p_elem):
66
+ """Paragraph text with w:tab elements rendered as '\\t'.
67
+ Used for heading detection since ETSI headings store the number and title in
68
+ separate runs separated by <w:tab/>, which _para_orig_text would drop."""
69
+ parts = []
70
+ for node in p_elem.iter():
71
+ if node.tag == qn('w:t') and node.text:
72
+ if not any(a.tag == qn('w:ins') for a in node.iterancestors()):
73
+ parts.append(node.text)
74
+ elif node.tag == qn('w:delText') and node.text:
75
+ parts.append(node.text)
76
+ elif node.tag == qn('w:tab'):
77
+ parts.append('\t')
78
+ return ''.join(parts)
79
+
80
+
81
+ def _heading_number(p_elem):
82
+ """Return dotted section number if this paragraph is a numbered heading, else None.
83
+ Requires the paragraph style to start with 'Heading' (case-insensitive) — this
84
+ prevents false positives from body paragraphs whose text starts with a digit,
85
+ notably bit-description lines like "1 = alphabet set." (style B30) that appear
86
+ in Terminal Profile sections."""
87
+ style = (_style_val(p_elem) or '').lower()
88
+ if not style.startswith('heading'):
89
+ return None
90
+ text = _para_text_with_tabs(p_elem).strip()
91
+ m = _HEADING_NUM_RE.match(text)
92
+ return m.group(1) if m else None
93
+
94
+
95
+ def _is_skip_marker(text):
96
+ """True for [...] / […] / ... / … / (...) / (…) after .strip()."""
97
+ return bool(_SKIP_MARKER_RE.match(text.strip()))
98
+
99
+
100
  def _is_rpr_ins(ins_elem):
101
  """True if w:ins is inside w:rPr — a formatting change, not a content insertion."""
102
  p = ins_elem.getparent()
 
225
 
226
  # ── Table change extraction ───────────────────────────────────────────────────
227
 
228
+ def _parse_table(tbl_elem, changes, section_heading='', section_number=''):
229
  header = _table_header(tbl_elem)
230
  header_key = header[:3] # first 3 columns enough for matching
231
  rows = tbl_elem.findall(qn('w:tr'))
 
235
 
236
  # ── Tracked row insertion ─────────────────────────────────────────
237
  if trPr is not None and trPr.find(qn('w:ins')) is not None:
238
+ # Find preceding stable rows for anchor + context disambiguation
239
+ stable_before = []
240
  for prev_idx in range(tr_idx - 1, -1, -1):
241
  prev_tr = rows[prev_idx]
242
  prev_trPr = prev_tr.find(qn('w:trPr'))
243
  if prev_trPr is None or prev_trPr.find(qn('w:ins')) is None:
244
+ stable_before.append(_row_col0(prev_tr))
245
+ if len(stable_before) >= 3:
246
+ break
247
+
248
+ after_anchor = stable_before[0] if stable_before else ''
249
+ context_rows_before = stable_before[1:]
250
 
251
  cells = []
252
  for tc in tr.findall(qn('w:tc')):
 
288
  'kind': 'table_row',
289
  'table_header': header_key,
290
  'after_row_anchor': after_anchor,
291
+ 'context_rows_before': context_rows_before,
292
  'section_heading': section_heading,
293
+ 'section_number': section_number,
294
  },
295
  'cells': cells,
296
  })
 
312
  'row_anchor': row_anchor,
313
  'col_idx': col_idx,
314
  'section_heading': section_heading,
315
+ 'section_number': section_number,
316
  },
317
  'old': old_text,
318
  'new': new_text,
 
339
  from lxml import etree
340
 
341
  prev_stable_text = ''
342
+ current_section = ''
343
 
344
  # ── Section-replace accumulator ───────────────────────────────────────────
345
  sec_del = [] # fully-deleted elements (CR del block)
 
388
  'del_heading': del_heading,
389
  'has_del_table': has_del_table,
390
  'anchor_text': sec_anchor,
391
+ 'section_number': current_section,
392
  },
393
  'elements_xml': elements_xml,
394
  })
 
420
  'location': {
421
  'kind': 'body',
422
  'anchor_text': prev_stable_text,
423
+ 'section_number': current_section,
424
  },
425
  'paragraphs': paras,
426
  })
 
430
  tag = elem.tag.split('}')[-1] if '}' in elem.tag else elem.tag
431
 
432
  if tag == 'p':
433
+ hn = _heading_number(elem)
434
+ if hn:
435
+ current_section = hn
436
+
437
  is_del = _is_deleted_para(elem)
438
  is_ins = _is_inserted_para(elem)
439
  is_empty = not _para_orig_text(elem).strip() and not _para_new_text(elem).strip()
 
484
  'location': {
485
  'kind': 'body_para',
486
  'para_context': _para_orig_text(elem).strip(),
487
+ 'section_number': current_section,
488
  },
489
  'old': old_text,
490
  'new': new_text,
491
  })
492
 
493
  orig = _para_orig_text(elem).strip()
494
+ if orig and not _is_skip_marker(orig):
495
  prev_stable_text = orig
496
 
497
  elif tag == 'tbl':
 
519
  # Table with inline cell changes
520
  flush_section()
521
  flush_group()
522
+ _parse_table(elem, changes, section_heading=prev_stable_text,
523
+ section_number=current_section)
524
 
525
  flush_section()
526
  flush_group()
scripts/etsi_client.py CHANGED
@@ -2,8 +2,8 @@
2
  etsi_client.py — ETSI document download helpers for ApplyCRs.
3
 
4
  Provides:
5
- ETSIDocFinder — CR TDoc downloads via docbox.etsi.org
6
- ETSISpecFinder — TS DOCX downloads via portal.etsi.org WKI chain
7
  """
8
 
9
  import json
@@ -27,7 +27,7 @@ def _get_proxies() -> dict:
27
  return {"http": proxy, "https": proxy}
28
 
29
 
30
- class ETSIDocFinder:
31
  HEADERS = {
32
  "User-Agent": (
33
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
@@ -156,7 +156,7 @@ class ETSIDocFinder:
156
  )
157
 
158
 
159
- class ETSISpecFinder:
160
  def __init__(self, eol_user: str, eol_password: str):
161
  self.eol_user = eol_user
162
  self.eol_password = eol_password
@@ -458,11 +458,26 @@ class ETSISpecFinder:
458
 
459
  if not versioned_urls:
460
  found_names = [u.split("/")[-1] for u in matching_urls]
461
- print(
462
- f" wki_id={wki_id}: version tag not in filenames {found_names}, "
463
- f"using first spec-matching DOCX as fallback"
464
- )
465
- versioned_urls = matching_urls
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
466
 
467
  matching_urls = versioned_urls
468
 
 
2
  etsi_client.py — ETSI document download helpers for ApplyCRs.
3
 
4
  Provides:
5
+ CRFetcher — CR TDoc downloads via docbox.etsi.org
6
+ TSFetcher — TS DOCX downloads via portal.etsi.org WKI chain
7
  """
8
 
9
  import json
 
27
  return {"http": proxy, "https": proxy}
28
 
29
 
30
+ class CRFetcher:
31
  HEADERS = {
32
  "User-Agent": (
33
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
 
156
  )
157
 
158
 
159
+ class TSFetcher:
160
  def __init__(self, eol_user: str, eol_password: str):
161
  self.eol_user = eol_user
162
  self.eol_password = eol_password
 
458
 
459
  if not versioned_urls:
460
  found_names = [u.split("/")[-1] for u in matching_urls]
461
+ # Decode the available version from the first filename (e.g. v160500 → 16.5.0)
462
+ avail_ver = None
463
+ if found_names:
464
+ m = re.search(r'v(\d{6})p?', found_names[0])
465
+ if m:
466
+ t = m.group(1)
467
+ avail_ver = f"{int(t[0:2])}.{int(t[2:4])}.{int(t[4:6])}"
468
+ if avail_ver:
469
+ print(
470
+ f"\n *** WARNING ***\n"
471
+ f" TS {doc_id} v{version_str} is not available on the ETSI portal.\n"
472
+ f" Portal has v{avail_ver} (file: {found_names[0]}).\n"
473
+ f" Options: target v{avail_ver} in your CR, or drop the TS DOCX manually.\n"
474
+ )
475
+ else:
476
+ print(
477
+ f" wki_id={wki_id}: version tag not in filenames {found_names}, "
478
+ f"rejecting (wrong version would be downloaded)"
479
+ )
480
+ return None
481
 
482
  matching_urls = versioned_urls
483
 
scripts/fetch_crs.py CHANGED
@@ -20,7 +20,7 @@ import sys
20
  import zipfile
21
  from pathlib import Path
22
 
23
- from etsi_client import ETSIDocFinder, ETSISpecFinder
24
 
25
 
26
  # ---------------------------------------------------------------------------
@@ -218,10 +218,13 @@ def download_cr(uid: str, cr_dir: Path, eol_user: str, eol_password: str):
218
  dest = cr_dir / f"{uid}.docx"
219
 
220
  if dest.exists():
 
 
 
221
  return dest, "already existed"
222
 
223
  try:
224
- finder = ETSIDocFinder(eol_user, eol_password)
225
  url = finder.search_document(uid)
226
  if isinstance(url, str) and "not found" in url.lower():
227
  return None, f"document not found: {uid}"
@@ -373,7 +376,7 @@ def download_ts(spec_number: str, version: str, ts_dir: Path,
373
  return filename, "already existed"
374
 
375
  try:
376
- finder = ETSISpecFinder(eol_user, eol_password)
377
  tmp_path = finder.search_document_docx(spec_number, version)
378
  except Exception as e:
379
  return None, f"download error: {e}"
@@ -391,7 +394,7 @@ def download_ts(spec_number: str, version: str, ts_dir: Path,
391
  dest.unlink()
392
  return None, f"invalid file (not a ZIP/DOCX, starts with {content[:4]!r})"
393
 
394
- # Verify the TS contains the expected spec number in its first paragraph
395
  try:
396
  import docx as _docx
397
  _doc = _docx.Document(dest)
@@ -402,6 +405,12 @@ def download_ts(spec_number: str, version: str, ts_dir: Path,
402
  f"wrong TS returned: got {first_para[:80]!r} "
403
  f"(expected spec {spec_no_space})"
404
  )
 
 
 
 
 
 
405
  except Exception:
406
  pass # Trust the ZIP check above
407
 
 
20
  import zipfile
21
  from pathlib import Path
22
 
23
+ from etsi_client import CRFetcher, TSFetcher
24
 
25
 
26
  # ---------------------------------------------------------------------------
 
218
  dest = cr_dir / f"{uid}.docx"
219
 
220
  if dest.exists():
221
+ extracted = cr_dir / f"{uid}_extracted.docx"
222
+ if extracted.exists():
223
+ return extracted, "already existed"
224
  return dest, "already existed"
225
 
226
  try:
227
+ finder = CRFetcher(eol_user, eol_password)
228
  url = finder.search_document(uid)
229
  if isinstance(url, str) and "not found" in url.lower():
230
  return None, f"document not found: {uid}"
 
376
  return filename, "already existed"
377
 
378
  try:
379
+ finder = TSFetcher(eol_user, eol_password)
380
  tmp_path = finder.search_document_docx(spec_number, version)
381
  except Exception as e:
382
  return None, f"download error: {e}"
 
394
  dest.unlink()
395
  return None, f"invalid file (not a ZIP/DOCX, starts with {content[:4]!r})"
396
 
397
+ # Verify the TS contains the expected spec number AND version in its first paragraph
398
  try:
399
  import docx as _docx
400
  _doc = _docx.Document(dest)
 
405
  f"wrong TS returned: got {first_para[:80]!r} "
406
  f"(expected spec {spec_no_space})"
407
  )
408
+ if f"V{version}" not in first_para:
409
+ dest.unlink()
410
+ return None, (
411
+ f"wrong version returned: got {first_para[:80]!r} "
412
+ f"(expected V{version})"
413
+ )
414
  except Exception:
415
  pass # Trust the ZIP check above
416
 
scripts/finalize_ts.py CHANGED
@@ -171,11 +171,12 @@ def _detect_meeting_separator(tbl):
171
  Returns the detected separator character, defaulting to '#'.
172
  """
173
  meet_col = 1 # default: standard ETSI Change History has Meeting in col 1
174
- if tbl.rows:
175
- for c_idx, cell in enumerate(tbl.rows[0].cells):
176
- if any(kw in cell.text.lower() for kw in ('meeting', 'body', 'tsg')):
177
- meet_col = c_idx
178
- break
 
179
  for row in reversed(tbl.rows):
180
  cells = row.cells
181
  if len(cells) > meet_col:
@@ -196,25 +197,21 @@ class NoChangeHistoryTable(Exception):
196
 
197
  def find_change_history_table(ts_doc):
198
  """
199
- Scan all tables backward from the end looking for a Change History table.
200
- A match requires both:
201
- - 8 or 9 columns in the last row (standard ETSI Change History layout)
202
- - At least one of the keywords 'cr', 'date', 'meeting', 'rev' in the header row
203
- Raises NoChangeHistoryTable (not ValueError) when none is found so callers
204
- can distinguish a structural absence from an unexpected error.
205
  """
206
- for tbl in reversed(ts_doc.tables):
207
- ncols = len(tbl.rows[-1].cells)
208
- if ncols not in (8, 9):
209
  continue
210
- if tbl.rows:
211
- header_text = ' '.join(c.text.strip() for c in tbl.rows[0].cells).lower()
212
- header_words = set(re.findall(r'\b\w+\b', header_text))
213
- if {'cr', 'date'}.issubset(header_words):
214
- return tbl
215
  raise NoChangeHistoryTable(
216
  'No Change History table found in this document '
217
- '(no table with 8 or 9 columns and CR/Date/Meeting/Rev headers)'
218
  )
219
 
220
 
@@ -233,7 +230,9 @@ def find_history_table(ts_doc):
233
 
234
  def update_change_history_table(ts_doc, meta, pub_yyyy_mm, old_v, new_v, rev, author, date_str):
235
  tbl = find_change_history_table(ts_doc)
236
- ncols = len(tbl.rows[-1].cells)
 
 
237
 
238
  # Detect separator used in existing rows (e.g. '#' in 'SET#115', '-' in 'SET-119')
239
  # and reformat meeting_id accordingly so it matches the existing style.
 
171
  Returns the detected separator character, defaulting to '#'.
172
  """
173
  meet_col = 1 # default: standard ETSI Change History has Meeting in col 1
174
+ # row[0] is the "Change history" title; row[1] is the column header row
175
+ header_row = tbl.rows[1] if len(tbl.rows) > 1 else tbl.rows[0]
176
+ for c_idx, cell in enumerate(header_row.cells):
177
+ if any(kw in cell.text.lower() for kw in ('meeting', 'body', 'tsg')):
178
+ meet_col = c_idx
179
+ break
180
  for row in reversed(tbl.rows):
181
  cells = row.cells
182
  if len(cells) > meet_col:
 
197
 
198
  def find_change_history_table(ts_doc):
199
  """
200
+ Find the Change History table by looking for a first row whose text
201
+ contains "Change history" (the merged title cell that ETSI places at the
202
+ top of the annex table).
203
+
204
+ Raises NoChangeHistoryTable when no such table is found.
 
205
  """
206
+ for tbl in ts_doc.tables:
207
+ if not tbl.rows:
 
208
  continue
209
+ r0_text = ' '.join(c.text.strip() for c in tbl.rows[0].cells)
210
+ if 'Change history' in r0_text:
211
+ return tbl
 
 
212
  raise NoChangeHistoryTable(
213
  'No Change History table found in this document '
214
+ '(no table whose first row contains "Change history")'
215
  )
216
 
217
 
 
230
 
231
  def update_change_history_table(ts_doc, meta, pub_yyyy_mm, old_v, new_v, rev, author, date_str):
232
  tbl = find_change_history_table(ts_doc)
233
+ # row[0] is the "Change history" title (merged); row[1] is the column header row
234
+ header_row = tbl.rows[1] if len(tbl.rows) > 1 else tbl.rows[0]
235
+ ncols = len(header_row.cells)
236
 
237
  # Detect separator used in existing rows (e.g. '#' in 'SET#115', '-' in 'SET-119')
238
  # and reformat meeting_id accordingly so it matches the existing style.
scripts/orchestrate_cr.py CHANGED
@@ -169,7 +169,7 @@ def _apply_ts_group(spec_number, version, uids, ts_paths, cr_paths, spec_dir,
169
 
170
  print(f' Applying {len(combined_manifest)} change(s) to {ts_in.name}...')
171
  try:
172
- n_ok, n_skip, log_lines = apply_manifest(
173
  ts_in, combined_manifest, ts_applied, author=author, date=tc_date
174
  )
175
  except Exception as e:
@@ -183,7 +183,8 @@ def _apply_ts_group(spec_number, version, uids, ts_paths, cr_paths, spec_dir,
183
  for line in log_lines:
184
  if line.strip().startswith('ERROR'):
185
  errors.append(line.strip())
186
- print(f' -> Applied: {n_ok} Skipped: {n_skip}')
 
187
 
188
  print(' Finalising metadata...')
189
  ts_final_or_applied = ts_applied # fallback if finalise raises
 
169
 
170
  print(f' Applying {len(combined_manifest)} change(s) to {ts_in.name}...')
171
  try:
172
+ n_ok, n_skip, log_lines, n_parsed, n_merged = apply_manifest(
173
  ts_in, combined_manifest, ts_applied, author=author, date=tc_date
174
  )
175
  except Exception as e:
 
183
  for line in log_lines:
184
  if line.strip().startswith('ERROR'):
185
  errors.append(line.strip())
186
+ print(f' Parsed: {n_parsed} body changes (merged to {n_merged} groups)'
187
+ f' → Applied: {n_ok} Skipped: {n_skip}')
188
 
189
  print(' Finalising metadata...')
190
  ts_final_or_applied = ts_applied # fallback if finalise raises
scripts/ts_applicator.py CHANGED
@@ -51,6 +51,7 @@ _UNICODE_REPLACEMENTS = (
51
  ('\u2019', "'"), # right single quote
52
  ('\u201c', '"'), # left double quote
53
  ('\u201d', '"'), # right double quote
 
54
  )
55
 
56
 
@@ -123,26 +124,50 @@ def _full_para_text(para):
123
  ''.join(t.text or '' for t in el.findall('.//' + qn('w:delText')))
124
 
125
 
126
- def _find_para(doc, search_text, prefer_not_in_table=False):
127
- """
128
- Find the first paragraph containing search_text.
129
- Four levels of matching, in order of confidence:
130
- 1.0 — exact substring match
131
- 0.9 — NBSP/dash-normalised match (_norm)
132
- 0.8 — whitespace-stripped match (_norm_ws) handles tab vs nothing in
133
- structured paragraphs (refs '[27]\\t...', abbrevs 'CLT\\t...', headings '8.3\\t...')
134
- 0.6 — full XML text (including w:del content): handles anchors that were
135
- previously deleted by tracked_modify_para in an earlier apply step
136
- Returns (para, confidence) or (None, 0.0).
 
137
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  norm_search = _norm(search_text)
139
  ws_search = _norm_ws(search_text)
140
  candidates_exact = []
141
  candidates_norm = []
142
  candidates_ws = []
 
143
  candidates_del = []
144
 
145
- for para in doc.paragraphs:
146
  pt = para.text
147
  if search_text in pt:
148
  candidates_exact.append(para)
@@ -151,28 +176,157 @@ def _find_para(doc, search_text, prefer_not_in_table=False):
151
  elif ws_search and ws_search in _norm_ws(pt):
152
  candidates_ws.append(para)
153
  else:
154
- # Level 4: check full XML text (catches deleted-but-still-present paragraphs)
155
- full_pt = _full_para_text(para)
156
- if search_text in full_pt:
157
- candidates_del.append(para)
158
- elif ws_search and ws_search in _norm_ws(full_pt):
159
- candidates_del.append(para)
 
 
 
 
 
 
160
 
161
  def _in_table(para):
162
  p = para._element
163
  return any(a.tag == qn('w:tc') for a in p.iterancestors())
164
 
 
 
 
 
 
 
 
 
 
165
  for pool, conf in [(candidates_exact, 1.0), (candidates_norm, 0.9),
166
- (candidates_ws, 0.8), (candidates_del, 0.6)]:
 
167
  if not pool:
168
  continue
169
- if prefer_not_in_table:
170
- body_only = [p for p in pool if not _in_table(p)]
171
- if body_only:
172
- return body_only[0], conf
173
- return pool[0], conf
 
174
 
175
- return None, 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
 
177
 
178
  def _find_table_by_section(doc, section_heading):
@@ -233,11 +387,33 @@ def _find_table(doc, header_key):
233
  return None, 0.0
234
 
235
 
236
- def _find_row(tbl, anchor_text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  """
238
  Find first row in tbl where col-0 cell text contains anchor_text.
239
  Returns (row_idx, confidence) or (-1, 0.0).
240
 
 
 
 
241
  Matching levels, in order of confidence:
242
  1.0 — exact substring match
243
  0.9 — Unicode-normalised match (_norm: xa0, dashes, quotes, …)
@@ -248,38 +424,33 @@ def _find_row(tbl, anchor_text):
248
  0.5 — clean-prefix + token-overlap: when multiple rows share the prefix,
249
  pick the one whose col-0 tokens overlap most with the anchor tokens.
250
  """
 
251
  norm_anchor = _norm(anchor_text)
252
  ws_anchor = _norm_ws(anchor_text)
253
  alnum_anchor = _norm_alnum(anchor_text)
254
- best = (-1, 0.0)
255
-
256
- for idx, row in enumerate(tbl.rows):
257
- cell0 = row.cells[0].text if row.cells else ''
258
- if anchor_text in cell0:
259
- return idx, 1.0
260
- if norm_anchor and norm_anchor in _norm(cell0) and best[1] < 0.9:
261
- best = (idx, 0.9)
262
- elif ws_anchor and ws_anchor in _norm_ws(cell0) and best[1] < 0.8:
263
- best = (idx, 0.8)
264
- elif alnum_anchor and alnum_anchor in _norm_alnum(cell0) and best[1] < 0.6:
265
- best = (idx, 0.6)
266
-
267
- if best[0] >= 0:
268
- return best
 
269
 
270
  # ── Prefix-based partial match ─────────────────────────────────────────────
271
- # The anchor may have Unicode chars embedded mid-text that prevent all string
272
- # comparisons above from matching, even after normalisation (e.g. when the CR
273
- # extracts '\xa0' between spec-number parts but the TS has different encoding).
274
- # Strategy: use only the clean ASCII prefix of the anchor as the search key.
275
- # If that prefix is found in exactly one row → we've uniquely identified it.
276
- # If it appears in several rows → pick the one whose full token set overlaps
277
- # most with the anchor's tokens (the user's described disambiguation rule).
278
  prefix = _clean_prefix(anchor_text)
279
  if prefix and len(prefix) > 8:
280
  prefix_low = prefix.lower()
281
  hits = [
282
- idx for idx, row in enumerate(tbl.rows)
283
  if row.cells and prefix_low in row.cells[0].text.lower()
284
  ]
285
  if len(hits) == 1:
@@ -289,7 +460,7 @@ def _find_row(tbl, anchor_text):
289
  best_score, best_idx = -1, -1
290
  for hit_idx in hits:
291
  cell_tokens = set(re.findall(r'[a-z0-9]+',
292
- tbl.rows[hit_idx].cells[0].text.lower()))
293
  score = len(anchor_tokens & cell_tokens)
294
  if score > best_score:
295
  best_score, best_idx = score, hit_idx
@@ -401,29 +572,77 @@ def _apply_section_replace(doc, change, rev, author, date, log):
401
  loc = change['location']
402
  del_heading = loc.get('del_heading', '')
403
  has_del_table = loc.get('has_del_table', False)
 
404
  elements_xml = change.get('elements_xml', [])
405
 
406
  if not elements_xml:
407
  log.append(' SKIP section_replace: no elements in manifest')
408
  return False
409
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
  # ── Find the TS paragraph that matches the deleted heading ─────────────────
411
  ts_para_elem = None
 
412
  if del_heading:
413
- for para in doc.paragraphs:
414
  pt = para.text
415
  if del_heading in pt or _norm(del_heading) in _norm(pt):
416
  ts_para_elem = para._element
417
  break
418
  if ts_para_elem is None:
419
  # Fallback: include paragraphs whose XML text (inc. del runs) matches
420
- for para in doc.paragraphs:
421
  if del_heading in _full_para_text(para):
422
  ts_para_elem = para._element
423
  break
 
 
 
 
 
 
 
 
 
 
 
 
424
 
425
  if ts_para_elem is None:
 
 
 
 
 
 
 
 
 
 
 
 
426
  log.append(f' ERROR section_replace: del_heading {del_heading!r} not found in TS')
 
 
 
 
 
 
 
 
427
  return False
428
 
429
  ts_body = ts_para_elem.getparent()
@@ -475,13 +694,16 @@ def _apply_section_replace(doc, change, rev, author, date, log):
475
  el.set(qn('w:id'), rev.next())
476
  cloned.append(cloned_elem)
477
 
478
- # ── Insert cloned elements before the TS heading paragraph ────────────────
479
  insert_idx = list(ts_body).index(ts_para_elem)
 
 
480
  for i, elem in enumerate(cloned):
481
  ts_body.insert(insert_idx + i, elem)
482
 
483
- # ── Remove the now-replaced TS elements ───────────────────────────────────
484
- ts_body.remove(ts_para_elem)
 
485
  if ts_tbl_elem is not None:
486
  ts_body.remove(ts_tbl_elem)
487
 
@@ -580,14 +802,24 @@ def _apply_text_replace(doc, change, rev, author, date, log):
580
 
581
  elif loc['kind'] == 'body_para':
582
  ctx = loc.get('para_context', '')
583
- # Try to find the paragraph by old text first
584
- para, conf = _find_para(doc, old, prefer_not_in_table=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
585
  if para is None:
586
- # Fall back: find by paragraph context
587
- para, conf = _find_para(doc, ctx, prefer_not_in_table=True)
588
- if para is None:
589
- log.append(f" ERROR text_replace: old text {old!r} not found in TS")
590
- return False
591
  if old in para.text:
592
  tracked_modify_para(para, old, new, rev, author, date)
593
  log.append(f" OK text_replace (body_para conf={conf:.1f}): {old!r} β†’ {new!r}")
@@ -600,14 +832,26 @@ def _apply_text_replace(doc, change, rev, author, date, log):
600
 
601
 
602
  def _apply_para_insert(doc, change, rev, author, date, log):
603
- anchor_text = change['location'].get('anchor_text', '')
 
 
604
  paras_data = change.get('paragraphs', [])
605
  if not paras_data:
606
  return True
607
 
608
- anchor_para, conf = _find_para(doc, anchor_text)
 
609
  if anchor_para is None:
610
- log.append(f" ERROR para_insert: anchor not found {anchor_text[:60]!r}")
 
 
 
 
 
 
 
 
 
611
  return False
612
 
613
  items = [(p['text'], p['style'] or 'Normal') for p in paras_data]
@@ -633,7 +877,8 @@ def _apply_row_insert(doc, change, rev, author, date, log, last_inserted=None):
633
  return False
634
 
635
  after_anchor = loc.get('after_row_anchor', '')
636
- row_idx, r_conf = _find_row(tbl, after_anchor)
 
637
  if row_idx < 0:
638
  log.append(f" ERROR row_insert: anchor row not found {after_anchor!r}")
639
  return False
@@ -697,7 +942,7 @@ def _merge_para_inserts(manifest):
697
  def apply_manifest(ts_path, manifest, out_path, author=DEFAULT_AUTHOR, date=DEFAULT_DATE):
698
  """
699
  Apply all changes in manifest to ts_path, save to out_path.
700
- Returns (n_ok, n_skipped, log_lines).
701
  """
702
  doc = docx.Document(str(ts_path))
703
  rev = RevCounter(doc)
@@ -705,7 +950,9 @@ def apply_manifest(ts_path, manifest, out_path, author=DEFAULT_AUTHOR, date=DEFA
705
  n_ok = 0
706
  n_skip = 0
707
 
 
708
  manifest = _merge_para_inserts(manifest)
 
709
 
710
  # Track last inserted <w:tr> per (tbl_id, anchor_row_idx) to maintain
711
  # forward insertion order when multiple row_inserts target the same anchor.
@@ -732,7 +979,7 @@ def apply_manifest(ts_path, manifest, out_path, author=DEFAULT_AUTHOR, date=DEFA
732
  n_skip += 1
733
 
734
  doc.save(str(out_path))
735
- return n_ok, n_skip, log
736
 
737
 
738
  # ── CLI ───────────────────────────────────────────────────────────────────────
@@ -752,11 +999,11 @@ def main():
752
  manifest = json.load(f)
753
 
754
  print(f'Applying {len(manifest)} change(s) from manifest to {ts_path.name}...')
755
- n_ok, n_skip, log = apply_manifest(ts_path, manifest, out_path, author=args.author)
756
 
757
  for line in log:
758
  print(line)
759
- print(f'\nResult: {n_ok} applied, {n_skip} skipped')
760
  print(f'Output: {out_path}')
761
 
762
 
 
51
  ('\u2019', "'"), # right single quote
52
  ('\u201c', '"'), # left double quote
53
  ('\u201d', '"'), # right double quote
54
+ ('\u2026', '...'), # horizontal ellipsis β†’ three dots
55
  )
56
 
57
 
 
124
  ''.join(t.text or '' for t in el.findall('.//' + qn('w:delText')))
125
 
126
 
127
def _original_para_text(para):
    """Rebuild the text a paragraph had before its tracked changes.

    Walks the paragraph's XML in document order and concatenates:
      - every ``w:delText`` node (text that was deleted, i.e. original text)
      - every ``w:t`` node that has no ``w:ins`` ancestor inside the paragraph
        (text that was never touched)
    ``w:t`` nodes wrapped in a ``w:ins`` are left out because they were added
    by a tracked insertion.

    This lets an anchor written against the original wording (for example
    'SCP81Connection') still match after a tracked '1' → 'X' replacement has
    been applied to the paragraph.

    Returns the reconstructed text as a single str.
    """
    root = para._element
    # Resolve the qualified tag names once instead of once per node.
    tag_text, tag_deleted, tag_inserted = qn('w:t'), qn('w:delText'), qn('w:ins')

    def _inside_insertion(text_node):
        # Climb towards the paragraph element; stop as soon as we reach it so
        # that anything above the paragraph is never considered.
        for ancestor in text_node.iterancestors():
            if ancestor is root:
                return False
            if ancestor.tag == tag_inserted:
                return True
        return False

    pieces = []
    for node in root.iter():
        if node.tag == tag_deleted:
            pieces.append(node.text or '')
        elif node.tag == tag_text and not _inside_insertion(node):
            pieces.append(node.text or '')
    return ''.join(pieces)
157
+
158
+
159
+ def _match_paragraphs(paragraphs, search_text, prefer_not_in_table=False):
160
+ """Core 5-tier matching logic. Operates on any iterable of Paragraph objects.
161
+ Returns (para, confidence) or (None, 0.0)."""
162
  norm_search = _norm(search_text)
163
  ws_search = _norm_ws(search_text)
164
  candidates_exact = []
165
  candidates_norm = []
166
  candidates_ws = []
167
+ candidates_orig = []
168
  candidates_del = []
169
 
170
+ for para in paragraphs:
171
  pt = para.text
172
  if search_text in pt:
173
  candidates_exact.append(para)
 
176
  elif ws_search and ws_search in _norm_ws(pt):
177
  candidates_ws.append(para)
178
  else:
179
+ orig_pt = _original_para_text(para)
180
+ if (search_text in orig_pt
181
+ or (norm_search and norm_search in _norm(orig_pt))):
182
+ candidates_orig.append(para)
183
+ elif ws_search and ws_search in _norm_ws(orig_pt):
184
+ candidates_orig.append(para)
185
+ else:
186
+ full_pt = _full_para_text(para)
187
+ if search_text in full_pt:
188
+ candidates_del.append(para)
189
+ elif ws_search and ws_search in _norm_ws(full_pt):
190
+ candidates_del.append(para)
191
 
192
  def _in_table(para):
193
  p = para._element
194
  return any(a.tag == qn('w:tc') for a in p.iterancestors())
195
 
196
+ if not prefer_not_in_table:
197
+ for pool, conf in [(candidates_exact, 1.0), (candidates_norm, 0.9),
198
+ (candidates_ws, 0.8), (candidates_orig, 0.7),
199
+ (candidates_del, 0.6)]:
200
+ if pool:
201
+ return pool[0], conf
202
+ return None, 0.0
203
+
204
+ best_table_match = (None, 0.0)
205
  for pool, conf in [(candidates_exact, 1.0), (candidates_norm, 0.9),
206
+ (candidates_ws, 0.8), (candidates_orig, 0.7),
207
+ (candidates_del, 0.6)]:
208
  if not pool:
209
  continue
210
+ body_only = [p for p in pool if not _in_table(p)]
211
+ if body_only:
212
+ return body_only[0], conf
213
+ if best_table_match[0] is None:
214
+ best_table_match = (pool[0], conf)
215
+ return best_table_match if best_table_match[0] is not None else (None, 0.0)
216
 
217
+
218
def _find_para(doc, search_text, prefer_not_in_table=False):
    """Locate the first paragraph in the whole document containing search_text.

    Thin wrapper that runs _match_paragraphs over ``doc.paragraphs`` and
    passes ``prefer_not_in_table`` through unchanged.

    Returns whatever _match_paragraphs returns: (para, confidence), or
    (None, 0.0) when nothing matches.
    """
    every_paragraph = doc.paragraphs
    return _match_paragraphs(every_paragraph, search_text, prefer_not_in_table)
222
+
223
+
224
+ # ── Section-aware anchor search ───────────────────────────────────────────────
225
+
226
+ _HEADING_NUM_RE = re.compile(r'^(\d+(?:\.\d+)*)\s+\S')
227
+
228
+
229
+ def _para_heading_number(para):
230
+ """Dotted section number if this paragraph is a real TS heading, else None.
231
+ Requires the paragraph style to start with 'Heading' (case-insensitive) β€” this
232
+ rejects false positives from TOC entries (style 'toc N'), address lines in the
233
+ front matter (style 'FP'), change history labels (style 'B3'), etc. ETSI/3GPP
234
+ TS documents always style real headings as 'Heading 1'..'Heading N'."""
235
+ style_name = (para.style.name if para.style is not None else '') or ''
236
+ if not style_name.lower().startswith('heading'):
237
+ return None
238
+ m = _HEADING_NUM_RE.match(para.text.strip())
239
+ return m.group(1) if m else None
240
+
241
+
242
+ def _is_descendant_section(child, parent):
243
+ """True if `child` is `parent` or nested under it (by dotted-prefix)."""
244
+ return child == parent or child.startswith(parent + '.')
245
+
246
+
247
def _section_range(doc, target):
    """Compute the slice of doc.paragraphs covered by section `target`.

    Returns (start_idx, end_idx) where
      start_idx  is the index of the first heading whose number equals target
      end_idx    is the index of the next heading that is neither target nor
                 one of its dotted descendants, or len(doc.paragraphs) when
                 no such heading follows.
    Returns (None, None) when no heading carries the target number.

    Nothing is cached; the paragraph list is scanned on every call.
    """
    paras = doc.paragraphs
    start = None
    for idx, paragraph in enumerate(paras):
        number = _para_heading_number(paragraph)
        if number is None:
            # Not a numbered heading: cannot open or close a section.
            continue
        if start is None:
            if number == target:
                start = idx
            continue
        if not _is_descendant_section(number, target):
            return (start, idx)
    if start is None:
        return (None, None)
    return (start, len(paras))
265
+
266
+
267
def _enclosing_heading(doc, para):
    """Find the numbered heading that `para` sits under.

    `para` is located in doc.paragraphs by identity of its XML element, then
    the list is scanned backwards starting at `para` itself (so a heading
    paragraph is its own enclosing heading).

    Returns the heading Paragraph, or None when `para` is not among
    doc.paragraphs or no numbered heading is found at or before it.
    """
    paras = doc.paragraphs
    wanted = para._element
    position = next(
        (i for i, candidate in enumerate(paras) if candidate._element is wanted),
        None,
    )
    if position is None:
        return None
    for candidate in reversed(paras[:position + 1]):
        if _para_heading_number(candidate) is not None:
            return candidate
    return None
283
+
284
+
285
def _find_para_in_section(doc, search_text, section_number, prefer_not_in_table=False):
    """Search for search_text only inside the paragraphs of one section.

    Returns a 3-tuple (para, conf, status):
      status == 'no_section'  section_number is empty, or no heading with that
                              number exists; para is None and conf is 0.0.
      status == 'in_section'  the section was located; (para, conf) is the
                              result of _match_paragraphs on its paragraphs,
                              which may still be (None, 0.0).
    """
    not_located = (None, 0.0, 'no_section')
    if not section_number:
        return not_located
    first, stop = _section_range(doc, section_number)
    if first is None:
        return not_located
    scoped = doc.paragraphs[first:stop]
    para, conf = _match_paragraphs(scoped, search_text, prefer_not_in_table)
    return (para, conf, 'in_section')
297
+
298
+
299
def _find_para_with_section(doc, search_text, section_number, kind_label, log,
                            prefer_not_in_table=False):
    """Section-aware anchor search that records WARN/ERROR/HINT lines in `log`.

    Parameters:
      doc                  document to search
      search_text          anchor text to look for
      section_number       dotted section number declared on the change ('' if none)
      kind_label           change kind used as a prefix in ERROR lines
      log                  list of str; diagnostic lines are appended to it
      prefer_not_in_table  forwarded to the underlying paragraph matchers

    Behaviour:
      * section_number given, section found, anchor inside it -> (para, conf).
      * section_number given, section not found -> WARN, then global _find_para.
      * section_number given, section found, anchor NOT inside it -> ERROR
        (plus a HINT when the anchor exists elsewhere) and (None, 0.0).
      * section_number empty -> WARN, then global _find_para.

    Returns (para, confidence), or (None, 0.0) when nothing usable was found.
    """
    snippet = search_text[:60]

    if not section_number:
        log.append(f" WARN no section_number on change — global anchor search for {snippet!r}")
        return _find_para(doc, search_text, prefer_not_in_table)

    para, conf, status = _find_para_in_section(
        doc, search_text, section_number, prefer_not_in_table)
    if status == 'in_section' and para is not None:
        return para, conf

    if status == 'no_section':
        log.append(f" WARN section '{section_number}' not found in TS — falling back to global search")
        return _find_para(doc, search_text, prefer_not_in_table)

    # The section exists but does not contain the anchor. A global lookup is
    # done only to tell the user where the text actually lives; its result is
    # deliberately NOT returned, so the change is not applied in the wrong place.
    g_para, _ = _find_para(doc, search_text, prefer_not_in_table)
    if g_para is not None:
        enc = _enclosing_heading(doc, g_para)
        actual = _para_heading_number(enc) if enc is not None else '?'
        log.append(f" ERROR {kind_label}: anchor {snippet!r} declared in section "
                   f"{section_number} but found in section {actual}")
        log.append(f" HINT nearest match: {g_para.text[:120]!r}")
    else:
        log.append(f" ERROR {kind_label}: anchor {snippet!r} not found in section "
                   f"{section_number} (or anywhere)")
    return None, 0.0
330
 
331
 
332
  def _find_table_by_section(doc, section_heading):
 
387
  return None, 0.0
388
 
389
 
390
def _disambiguate_by_context(all_rows, candidates, context_rows_before):
    """Choose the candidate row whose preceding rows best match the context.

    Parameters:
      all_rows             list of table rows (each exposing ``.cells``)
      candidates           non-empty list of row indices that matched the anchor
      context_rows_before  expected col-0 texts of the rows above the anchor,
                           closest-first (entry 0 is the row directly above)

    Each candidate scores one point per context entry found in col-0 of the
    row at the corresponding distance above it. Falsy entries and positions
    above the top of the table are skipped.

    Returns the index with the highest score; on a tie the earliest candidate
    in `candidates` wins.
    """
    # Normalise every expected text once rather than once per candidate.
    # An entry is None when the raw text is falsy and should be ignored.
    expected_forms = [
        (_norm(expected), _norm_ws(expected)) if expected else None
        for expected in context_rows_before
    ]

    best_score, best_idx = -1, candidates[0]
    for idx in candidates:
        score = 0
        for depth, forms in enumerate(expected_forms, start=1):
            ctx_idx = idx - depth
            if ctx_idx < 0 or forms is None:
                continue
            norm_expected, ws_expected = forms
            cells = all_rows[ctx_idx].cells
            cell0 = cells[0].text if cells else ''
            # Guard against a normalised form that collapsed to '': the empty
            # string is "in" every cell text and would score a false match.
            # This mirrors the bool(...) guards used by _find_row.
            if ((norm_expected and norm_expected in _norm(cell0))
                    or (ws_expected and ws_expected in _norm_ws(cell0))):
                score += 1
        if score > best_score:
            best_score, best_idx = score, idx
    return best_idx
407
+
408
+
409
+ def _find_row(tbl, anchor_text, context_rows_before=None):
410
  """
411
  Find first row in tbl where col-0 cell text contains anchor_text.
412
  Returns (row_idx, confidence) or (-1, 0.0).
413
 
414
+ When context_rows_before is provided and multiple rows match, uses the
415
+ col-0 texts of the rows preceding each candidate to disambiguate.
416
+
417
  Matching levels, in order of confidence:
418
 1.0 — exact substring match
419
 0.9 — Unicode-normalised match (_norm: xa0, dashes, quotes, …)
 
424
 0.5 — clean-prefix + token-overlap: when multiple rows share the prefix,
425
  pick the one whose col-0 tokens overlap most with the anchor tokens.
426
  """
427
+ all_rows = list(tbl.rows)
428
  norm_anchor = _norm(anchor_text)
429
  ws_anchor = _norm_ws(anchor_text)
430
  alnum_anchor = _norm_alnum(anchor_text)
431
+
432
+ for match_fn, conf in [
433
+ (lambda c: anchor_text in c, 1.0),
434
+ (lambda c: bool(norm_anchor) and norm_anchor in _norm(c), 0.9),
435
+ (lambda c: bool(ws_anchor) and ws_anchor in _norm_ws(c), 0.8),
436
+ (lambda c: bool(alnum_anchor) and alnum_anchor in _norm_alnum(c), 0.6),
437
+ ]:
438
+ candidates = [
439
+ idx for idx, row in enumerate(all_rows)
440
+ if row.cells and match_fn(row.cells[0].text)
441
+ ]
442
+ if not candidates:
443
+ continue
444
+ if len(candidates) == 1 or not context_rows_before:
445
+ return candidates[0], conf
446
+ return _disambiguate_by_context(all_rows, candidates, context_rows_before), conf
447
 
448
  # ── Prefix-based partial match ─────────────────────────────────────────────
 
 
 
 
 
 
 
449
  prefix = _clean_prefix(anchor_text)
450
  if prefix and len(prefix) > 8:
451
  prefix_low = prefix.lower()
452
  hits = [
453
+ idx for idx, row in enumerate(all_rows)
454
  if row.cells and prefix_low in row.cells[0].text.lower()
455
  ]
456
  if len(hits) == 1:
 
460
  best_score, best_idx = -1, -1
461
  for hit_idx in hits:
462
  cell_tokens = set(re.findall(r'[a-z0-9]+',
463
+ all_rows[hit_idx].cells[0].text.lower()))
464
  score = len(anchor_tokens & cell_tokens)
465
  if score > best_score:
466
  best_score, best_idx = score, hit_idx
 
572
  loc = change['location']
573
  del_heading = loc.get('del_heading', '')
574
  has_del_table = loc.get('has_del_table', False)
575
+ section_number = loc.get('section_number', '')
576
  elements_xml = change.get('elements_xml', [])
577
 
578
  if not elements_xml:
579
  log.append(' SKIP section_replace: no elements in manifest')
580
  return False
581
 
582
+ # ── Resolve search scope: restrict to declared section if possible ─────────
583
+ search_paras = doc.paragraphs
584
+ section_status = 'no_section_required'
585
+ if section_number:
586
+ start, end = _section_range(doc, section_number)
587
+ if start is not None:
588
+ search_paras = doc.paragraphs[start:end]
589
+ section_status = 'in_section'
590
+ else:
591
+ log.append(f" WARN section '{section_number}' not found in TS β€” falling back to global search")
592
+ section_status = 'section_not_in_ts'
593
+ else:
594
+ log.append(" WARN no section_number on section_replace β€” global search")
595
+
596
  # ── Find the TS paragraph that matches the deleted heading ─────────────────
597
  ts_para_elem = None
598
+ insert_after_anchor = False # when True: insert after anchor, don't delete it
599
  if del_heading:
600
+ for para in search_paras:
601
  pt = para.text
602
  if del_heading in pt or _norm(del_heading) in _norm(pt):
603
  ts_para_elem = para._element
604
  break
605
  if ts_para_elem is None:
606
  # Fallback: include paragraphs whose XML text (inc. del runs) matches
607
+ for para in search_paras:
608
  if del_heading in _full_para_text(para):
609
  ts_para_elem = para._element
610
  break
611
+ else:
612
+ # No heading to delete β€” use anchor_text to find insertion point
613
+ anchor_text = loc.get('anchor_text', '')
614
+ if anchor_text:
615
+ if section_status == 'in_section':
616
+ anchor_para, _, _ = _find_para_in_section(
617
+ doc, anchor_text, section_number)
618
+ else:
619
+ anchor_para, _ = _find_para(doc, anchor_text)
620
+ if anchor_para is not None:
621
+ ts_para_elem = anchor_para._element
622
+ insert_after_anchor = True
623
 
624
  if ts_para_elem is None:
625
+ # Section mismatch check: if declared section exists, but del_heading
626
+ # is found GLOBALLY in a different section, report that.
627
+ if section_status == 'in_section' and del_heading:
628
+ for para in doc.paragraphs:
629
+ pt = para.text
630
+ if del_heading in pt or del_heading in _full_para_text(para):
631
+ enc = _enclosing_heading(doc, para)
632
+ actual = _para_heading_number(enc) if enc is not None else '?'
633
+ log.append(f' ERROR section_replace: del_heading {del_heading!r} declared in section '
634
+ f'{section_number} but found in section {actual}')
635
+ log.append(f" HINT nearest match: {para.text[:120]!r}")
636
+ return False
637
  log.append(f' ERROR section_replace: del_heading {del_heading!r} not found in TS')
638
+ tokens = del_heading.split()[:3] if del_heading else []
639
+ if tokens:
640
+ _hints = sorted(
641
+ [p for p in doc.paragraphs if any(tok in p.text for tok in tokens)],
642
+ key=lambda p: -len(set(del_heading.split()) & set(p.text.split()))
643
+ )[:3]
644
+ for _h in _hints:
645
+ log.append(f" HINT nearest match: {_h.text[:120]!r}")
646
  return False
647
 
648
  ts_body = ts_para_elem.getparent()
 
694
  el.set(qn('w:id'), rev.next())
695
  cloned.append(cloned_elem)
696
 
697
+ # ── Insert cloned elements before (or after) the anchor paragraph ─────────
698
  insert_idx = list(ts_body).index(ts_para_elem)
699
+ if insert_after_anchor:
700
+ insert_idx += 1 # insert after anchor, not before it
701
  for i, elem in enumerate(cloned):
702
  ts_body.insert(insert_idx + i, elem)
703
 
704
+ # ── Remove the now-replaced TS elements (only when a heading was deleted) ──
705
+ if not insert_after_anchor:
706
+ ts_body.remove(ts_para_elem)
707
  if ts_tbl_elem is not None:
708
  ts_body.remove(ts_tbl_elem)
709
 
 
802
 
803
  elif loc['kind'] == 'body_para':
804
  ctx = loc.get('para_context', '')
805
+ section_number = loc.get('section_number', '')
806
+ if len(old) < 4 and ctx:
807
+ # Short old text matches too broadly (e.g. a single digit would hit
808
+ # the title paragraph). Locate by context first, then verify old
809
+ # text is present in that paragraph.
810
+ para, conf = _find_para_with_section(
811
+ doc, ctx, section_number, 'text_replace', log, prefer_not_in_table=True)
812
+ if para is None or old not in para.text:
813
+ para = None
814
+ else:
815
+ para, conf = _find_para_with_section(
816
+ doc, old, section_number, 'text_replace', log, prefer_not_in_table=True)
817
+ if para is None and ctx:
818
+ para, conf = _find_para_with_section(
819
+ doc, ctx, section_number, 'text_replace', log, prefer_not_in_table=True)
820
  if para is None:
821
+ log.append(f" ERROR text_replace: old text {old!r} not found in TS")
822
+ return False
 
 
 
823
  if old in para.text:
824
  tracked_modify_para(para, old, new, rev, author, date)
825
  log.append(f" OK text_replace (body_para conf={conf:.1f}): {old!r} β†’ {new!r}")
 
832
 
833
 
834
  def _apply_para_insert(doc, change, rev, author, date, log):
835
+ loc = change['location']
836
+ anchor_text = loc.get('anchor_text', '')
837
+ section_number = loc.get('section_number', '')
838
  paras_data = change.get('paragraphs', [])
839
  if not paras_data:
840
  return True
841
 
842
+ anchor_para, conf = _find_para_with_section(
843
+ doc, anchor_text, section_number, 'para_insert', log)
844
  if anchor_para is None:
845
+ # When no section_number context, emit the legacy ERROR + HINT lines
846
+ if not section_number:
847
+ log.append(f" ERROR para_insert: anchor not found {anchor_text[:60]!r}")
848
+ tokens = anchor_text.split()[:3]
849
+ _hints = sorted(
850
+ [p for p in doc.paragraphs if any(tok in p.text for tok in tokens)],
851
+ key=lambda p: -len(set(anchor_text.split()) & set(p.text.split()))
852
+ )[:3]
853
+ for _h in _hints:
854
+ log.append(f" HINT nearest match: {_h.text[:120]!r}")
855
  return False
856
 
857
  items = [(p['text'], p['style'] or 'Normal') for p in paras_data]
 
877
  return False
878
 
879
  after_anchor = loc.get('after_row_anchor', '')
880
+ context_rows_before = loc.get('context_rows_before', [])
881
+ row_idx, r_conf = _find_row(tbl, after_anchor, context_rows_before)
882
  if row_idx < 0:
883
  log.append(f" ERROR row_insert: anchor row not found {after_anchor!r}")
884
  return False
 
942
  def apply_manifest(ts_path, manifest, out_path, author=DEFAULT_AUTHOR, date=DEFAULT_DATE):
943
  """
944
  Apply all changes in manifest to ts_path, save to out_path.
945
+ Returns (n_ok, n_skipped, log_lines, n_parsed, n_merged_groups).
946
  """
947
  doc = docx.Document(str(ts_path))
948
  rev = RevCounter(doc)
 
950
  n_ok = 0
951
  n_skip = 0
952
 
953
+ n_parsed = len(manifest)
954
  manifest = _merge_para_inserts(manifest)
955
+ n_merged = len(manifest)
956
 
957
  # Track last inserted <w:tr> per (tbl_id, anchor_row_idx) to maintain
958
  # forward insertion order when multiple row_inserts target the same anchor.
 
979
  n_skip += 1
980
 
981
  doc.save(str(out_path))
982
+ return n_ok, n_skip, log, n_parsed, n_merged
983
 
984
 
985
  # ── CLI ───────────────────────────────────────────────────────────────────────
 
999
  manifest = json.load(f)
1000
 
1001
  print(f'Applying {len(manifest)} change(s) from manifest to {ts_path.name}...')
1002
+ n_ok, n_skip, log, n_parsed, n_merged = apply_manifest(ts_path, manifest, out_path, author=args.author)
1003
 
1004
  for line in log:
1005
  print(line)
1006
+ print(f'\nParsed: {n_parsed} body changes (merged to {n_merged} groups) β†’ Applied: {n_ok} Skipped: {n_skip}')
1007
  print(f'Output: {out_path}')
1008
 
1009