heymenn committed on
Commit
b5fc740
·
1 Parent(s): 8692cb7

reduce codebase size, prevent errors and unknowns and configure new ETSI download process

Browse files
app.py CHANGED
@@ -264,6 +264,22 @@ def load_hf_index_cached(hf_token: str, hf_repo: str) -> list[dict]:
264
  return st.session_state[key]
265
 
266
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
  # ── Page config ───────────────────────────────────────────────────────────────
268
  st.set_page_config(
269
  page_title="CR Application Tool",
@@ -454,29 +470,11 @@ elif status == "upload":
454
  env["EOL_PASSWORD"] = st.session_state.eol_password
455
  # HF_TOKEN is already in env via os.environ
456
 
457
- log_file = open(index_log, "w")
458
- proc = subprocess.Popen(
459
- cmd,
460
- stdout=log_file,
461
- stderr=subprocess.STDOUT,
462
- env=env,
463
- )
464
- log_file.close()
465
-
466
- threading.Thread(
467
- target=_run_and_save_rc,
468
- args=(proc, rc_path),
469
- daemon=True,
470
- ).start()
471
- st.session_state.proc = proc
472
-
473
- state["status"] = "indexing"
474
- state["pid"] = proc.pid
475
- state["index_log"] = index_log
476
- state["output_dir"] = "" # no pipeline output yet
477
- state["started_at"] = datetime.now().isoformat()
478
- save_state(sid, state)
479
- st.rerun()
480
 
481
  # ════════════════════════════════════════════════════════════════════════════
482
  # INDEXING (build_cr_index.py running)
@@ -588,31 +586,13 @@ elif status == "ts_select":
588
  env["EOL_PASSWORD"] = st.session_state.eol_password
589
  # HF_TOKEN already in env via os.environ
590
 
591
- log_file = open(str(log_path), "w")
592
- proc = subprocess.Popen(
593
- cmd,
594
- stdout=log_file,
595
- stderr=subprocess.STDOUT,
596
- env=env,
597
- )
598
- log_file.close()
599
-
600
- threading.Thread(
601
- target=_run_and_save_rc,
602
- args=(proc, rc_path),
603
- daemon=True,
604
- ).start()
605
- st.session_state.proc = proc
606
-
607
- state["ts_id"] = selected_spec
608
- state["status"] = "running"
609
- state["pid"] = proc.pid
610
- state["output_dir"] = str(output_dir)
611
- state["log_path"] = str(log_path)
612
- state["run_log_paths"] = [str(log_path)]
613
- state["started_at"] = datetime.now().isoformat()
614
- save_state(sid, state)
615
- st.rerun()
616
 
617
  # ════════════════════════════════════════════════════════════════════════════
618
  # PREVIEW
@@ -671,32 +651,12 @@ elif status == "preview":
671
  env["EOL_USER"] = st.session_state.eol_user
672
  env["EOL_PASSWORD"] = st.session_state.eol_password
673
 
674
- log_file = open(str(log_path), "w")
675
- proc = subprocess.Popen(
676
- cmd,
677
- stdout=log_file,
678
- stderr=subprocess.STDOUT,
679
- env=env,
680
- )
681
- log_file.close()
682
-
683
- # Background thread writes returncode file when process finishes
684
- threading.Thread(
685
- target=_run_and_save_rc,
686
- args=(proc, rc_path),
687
- daemon=True,
688
- ).start()
689
-
690
- st.session_state.proc = proc
691
-
692
- state["status"] = "running"
693
- state["pid"] = proc.pid
694
- state["output_dir"] = str(output_dir)
695
- state["log_path"] = str(log_path)
696
- state["run_log_paths"] = [str(log_path)]
697
- state["started_at"] = datetime.now().isoformat()
698
- save_state(sid, state)
699
- st.rerun()
700
 
701
  # ════════════════════════════════════════════════════════════════════════════
702
  # RUNNING
@@ -873,8 +833,6 @@ elif status in ("done", "error"):
873
 
874
  if st.button("β–Ά Apply CRs to recovered TSs", type="primary"):
875
  retry_log = str(session_dir(sid) / f"pipeline_{int(time.time())}_retry.log")
876
- _rc_path(sid).unlink(missing_ok=True) # clear old returncode
877
-
878
  cmd = [
879
  sys.executable,
880
  str(SCRIPTS_DIR / "orchestrate_cr.py"),
@@ -885,26 +843,11 @@ elif status in ("done", "error"):
885
  env["EOL_USER"] = st.session_state.eol_user
886
  env["EOL_PASSWORD"] = st.session_state.eol_password
887
 
888
- log_file = open(retry_log, "w")
889
- proc = subprocess.Popen(
890
- cmd, stdout=log_file, stderr=subprocess.STDOUT, env=env
891
- )
892
- log_file.close()
893
-
894
- threading.Thread(
895
- target=_run_and_save_rc,
896
- args=(proc, _rc_path(sid)),
897
- daemon=True,
898
- ).start()
899
- st.session_state.proc = proc
900
-
901
- state["status"] = "running"
902
- state["pid"] = proc.pid
903
- state["log_path"] = retry_log
904
- state["run_log_paths"] = state.get("run_log_paths", []) + [retry_log]
905
- state["started_at"] = datetime.now().isoformat()
906
- save_state(sid, state)
907
- st.rerun()
908
  else:
909
  st.warning("No TSs available yet β€” retry download or upload DOCX files above.")
910
 
 
264
  return st.session_state[key]
265
 
266
 
267
+ def _launch_proc(cmd, env, log_path, sid, state, extra_state: dict):
268
+ """Open log_path, Popen cmd, start rc-writer thread, update state, rerun."""
269
+ rc_path = _rc_path(sid)
270
+ rc_path.unlink(missing_ok=True)
271
+ log_file = open(str(log_path), "w")
272
+ proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT, env=env)
273
+ log_file.close()
274
+ threading.Thread(target=_run_and_save_rc, args=(proc, rc_path), daemon=True).start()
275
+ st.session_state.proc = proc
276
+ state.update(extra_state)
277
+ state["pid"] = proc.pid
278
+ state["started_at"] = datetime.now().isoformat()
279
+ save_state(sid, state)
280
+ st.rerun()
281
+
282
+
283
  # ── Page config ───────────────────────────────────────────────────────────────
284
  st.set_page_config(
285
  page_title="CR Application Tool",
 
470
  env["EOL_PASSWORD"] = st.session_state.eol_password
471
  # HF_TOKEN is already in env via os.environ
472
 
473
+ _launch_proc(cmd, env, index_log, sid, state, {
474
+ "status": "indexing",
475
+ "index_log": index_log,
476
+ "output_dir": "", # no pipeline output yet
477
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
478
 
479
  # ════════════════════════════════════════════════════════════════════════════
480
  # INDEXING (build_cr_index.py running)
 
586
  env["EOL_PASSWORD"] = st.session_state.eol_password
587
  # HF_TOKEN already in env via os.environ
588
 
589
+ _launch_proc(cmd, env, log_path, sid, state, {
590
+ "ts_id": selected_spec,
591
+ "status": "running",
592
+ "output_dir": str(output_dir),
593
+ "log_path": str(log_path),
594
+ "run_log_paths": [str(log_path)],
595
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
596
 
597
  # ════════════════════════════════════════════════════════════════════════════
598
  # PREVIEW
 
651
  env["EOL_USER"] = st.session_state.eol_user
652
  env["EOL_PASSWORD"] = st.session_state.eol_password
653
 
654
+ _launch_proc(cmd, env, log_path, sid, state, {
655
+ "status": "running",
656
+ "output_dir": str(output_dir),
657
+ "log_path": str(log_path),
658
+ "run_log_paths": [str(log_path)],
659
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
660
 
661
  # ════════════════════════════════════════════════════════════════════════════
662
  # RUNNING
 
833
 
834
  if st.button("β–Ά Apply CRs to recovered TSs", type="primary"):
835
  retry_log = str(session_dir(sid) / f"pipeline_{int(time.time())}_retry.log")
 
 
836
  cmd = [
837
  sys.executable,
838
  str(SCRIPTS_DIR / "orchestrate_cr.py"),
 
843
  env["EOL_USER"] = st.session_state.eol_user
844
  env["EOL_PASSWORD"] = st.session_state.eol_password
845
 
846
+ _launch_proc(cmd, env, retry_log, sid, state, {
847
+ "status": "running",
848
+ "log_path": retry_log,
849
+ "run_log_paths": state.get("run_log_paths", []) + [retry_log],
850
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
851
  else:
852
  st.warning("No TSs available yet β€” retry download or upload DOCX files above.")
853
 
scripts/cr_parser.py CHANGED
@@ -301,7 +301,7 @@ def _parse_body(body, changes):
301
  sec_anchor = ''
302
 
303
  def flush_section():
304
- nonlocal sec_state, sec_anchor
305
  if not sec_del and not sec_ins:
306
  sec_del.clear(); sec_sep.clear(); sec_ins.clear()
307
  sec_state = 'stable'
@@ -315,6 +315,17 @@ def _parse_body(body, changes):
315
  if t:
316
  del_heading = t
317
  break
 
 
 
 
 
 
 
 
 
 
 
318
  # Serialize all elements for the manifest (del + sep + ins)
319
  all_elems = sec_del + sec_sep + sec_ins
320
  elements_xml = [etree.tostring(e, encoding='unicode') for e in all_elems]
@@ -332,6 +343,14 @@ def _parse_body(body, changes):
332
  },
333
  'elements_xml': elements_xml,
334
  })
 
 
 
 
 
 
 
 
335
  sec_del.clear(); sec_sep.clear(); sec_ins.clear()
336
  sec_state = 'stable'
337
 
 
301
  sec_anchor = ''
302
 
303
  def flush_section():
304
+ nonlocal sec_state, sec_anchor, prev_stable_text
305
  if not sec_del and not sec_ins:
306
  sec_del.clear(); sec_sep.clear(); sec_ins.clear()
307
  sec_state = 'stable'
 
315
  if t:
316
  del_heading = t
317
  break
318
+ # Fallback: if first deleted element was a table, use its first cell text
319
+ if not del_heading:
320
+ for e in sec_del:
321
+ tag = e.tag.split('}')[-1] if '}' in e.tag else e.tag
322
+ if tag == 'tbl':
323
+ first_tc = e.find('.//' + qn('w:tc'))
324
+ if first_tc is not None:
325
+ p = first_tc.find('.//' + qn('w:p'))
326
+ del_heading = (_para_new_text(p) if p is not None
327
+ else _para_new_text(first_tc)).strip()
328
+ break
329
  # Serialize all elements for the manifest (del + sep + ins)
330
  all_elems = sec_del + sec_sep + sec_ins
331
  elements_xml = [etree.tostring(e, encoding='unicode') for e in all_elems]
 
343
  },
344
  'elements_xml': elements_xml,
345
  })
346
+ # Refresh anchor so subsequent para_insert targets the new text, not the deleted one
347
+ if sec_ins:
348
+ last_p = next((e for e in reversed(sec_ins)
349
+ if (e.tag.split('}')[-1] if '}' in e.tag else e.tag) == 'p'), None)
350
+ if last_p is not None:
351
+ candidate = _para_new_text(last_p).strip()
352
+ if candidate:
353
+ prev_stable_text = candidate
354
  sec_del.clear(); sec_sep.clear(); sec_ins.clear()
355
  sec_state = 'stable'
356
 
scripts/docx_helpers.py CHANGED
@@ -75,7 +75,7 @@ def map_sections(doc, clause_numbers):
75
 
76
  for i, para in enumerate(doc.paragraphs):
77
  text = para.text.strip()
78
- style = para.style.name
79
 
80
  matched = False
81
  for clause in clause_numbers:
@@ -418,9 +418,12 @@ def tracked_insert_table_row(tbl, cell_texts, rev, author=AUTHOR, date=DATE):
418
 
419
  # Find the last row that contains at least one non-empty <w:t> node.
420
  # This skips pre-allocated blank rows at the table bottom.
 
421
  last_content_tr = all_trs[-1]
422
  for tr in reversed(all_trs):
423
- if any(t.text and t.text.strip() for t in tr.findall('.//' + qn('w:t'))):
 
 
424
  last_content_tr = tr
425
  break
426
 
 
75
 
76
  for i, para in enumerate(doc.paragraphs):
77
  text = para.text.strip()
78
+ style = para.style.name if para.style else 'Normal'
79
 
80
  matched = False
81
  for clause in clause_numbers:
 
418
 
419
  # Find the last row that contains at least one non-empty <w:t> node.
420
  # This skips pre-allocated blank rows at the table bottom.
421
+ _CONTENT_TAGS = {qn('w:t'), qn('w:hyperlink'), qn('w:drawing'), qn('w:object')}
422
  last_content_tr = all_trs[-1]
423
  for tr in reversed(all_trs):
424
+ has_text = any(t.text and t.text.strip() for t in tr.findall('.//' + qn('w:t')))
425
+ has_other = any(el.tag in _CONTENT_TAGS for el in tr.iter())
426
+ if has_text or has_other:
427
  last_content_tr = tr
428
  break
429
 
scripts/etsi_client.py CHANGED
@@ -267,8 +267,6 @@ class ETSISpecFinder:
267
  today = datetime.date.today().isoformat()
268
 
269
  base_params = {
270
- "option": "com_standardssearch",
271
- "view": "data",
272
  "format": "json",
273
  "page": "1",
274
  "title": "1",
@@ -304,9 +302,12 @@ class ETSISpecFinder:
304
  params = {**base_params, "search": query}
305
  try:
306
  resp = requests.get(
307
- "https://www.etsi.org/",
308
  params=params,
309
- headers=self.headers,
 
 
 
310
  verify=False,
311
  timeout=15,
312
  proxies=_get_proxies(),
 
267
  today = datetime.date.today().isoformat()
268
 
269
  base_params = {
 
 
270
  "format": "json",
271
  "page": "1",
272
  "title": "1",
 
302
  params = {**base_params, "search": query}
303
  try:
304
  resp = requests.get(
305
+ "https://www.etsi.org/custom/standardssearch/data.php",
306
  params=params,
307
+ headers={
308
+ **self.headers,
309
+ "Referer": "https://www.etsi.org/standards/",
310
+ },
311
  verify=False,
312
  timeout=15,
313
  proxies=_get_proxies(),
scripts/fetch_crs.py CHANGED
@@ -67,9 +67,9 @@ def parse_excel_all_accepted(excel_path: str):
67
  ext = path.suffix.lower()
68
 
69
  if ext == ".xls":
70
- return _parse_xls_all(path)
71
  elif ext == ".xlsx":
72
- return _parse_xlsx_all(path)
73
  else:
74
  raise ValueError(f"Unsupported file extension: {ext!r}. Expected .xls or .xlsx")
75
 
@@ -78,7 +78,12 @@ def _name_pattern(name: str) -> re.Pattern:
78
  return re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE)
79
 
80
 
81
- def _parse_xls(path: Path, person_name: str):
 
 
 
 
 
82
  try:
83
  import xlrd
84
  except ImportError:
@@ -101,12 +106,15 @@ def _parse_xls(path: Path, person_name: str):
101
  by_col = col.get("SubmittedBy") or col.get("Submitted By") or col.get("submittedby")
102
  title_col = col.get("Title") or col.get("title")
103
 
104
- for name, c in [("Uid", uid_col), ("Type", type_col),
105
- ("Status", status_col), ("SubmittedBy", by_col)]:
106
- if c is None:
 
 
 
107
  raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")
108
 
109
- pattern = _name_pattern(person_name)
110
  results = []
111
 
112
  for r in range(2, ws.nrows): # skip header + empty duplicate
@@ -120,15 +128,20 @@ def _parse_xls(path: Path, person_name: str):
120
  continue
121
  if status != "Accepted":
122
  continue
123
- if not pattern.search(submitted_by):
124
  continue
125
 
126
- results.append((uid, title))
127
 
128
  return results
129
 
130
 
131
- def _parse_xlsx(path: Path, person_name: str):
 
 
 
 
 
132
  try:
133
  import openpyxl
134
  except ImportError:
@@ -153,12 +166,15 @@ def _parse_xlsx(path: Path, person_name: str):
153
  by_col = col.get("SubmittedBy") or col.get("Submitted By") or col.get("submittedby")
154
  title_col = col.get("Title") or col.get("title")
155
 
156
- for name, c in [("Uid", uid_col), ("Type", type_col),
157
- ("Status", status_col), ("SubmittedBy", by_col)]:
158
- if c is None:
 
 
 
159
  raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")
160
 
161
- pattern = _name_pattern(person_name)
162
  results = []
163
 
164
  for row in rows:
@@ -178,107 +194,10 @@ def _parse_xlsx(path: Path, person_name: str):
178
  continue
179
  if status != "Accepted":
180
  continue
181
- if not pattern.search(submitted_by):
182
  continue
183
 
184
- results.append((uid, title))
185
-
186
- return results
187
-
188
-
189
- def _parse_xls_all(path: Path):
190
- """Return (uid, title, submitted_by) for all Accepted CRs (no person filter)."""
191
- try:
192
- import xlrd
193
- except ImportError:
194
- sys.exit("ERROR: xlrd is not installed. Run: pip install xlrd")
195
-
196
- wb = xlrd.open_workbook(str(path))
197
- try:
198
- ws = wb.sheet_by_name("Contributions")
199
- except xlrd.XLRDError:
200
- ws = wb.sheet_by_index(0)
201
-
202
- headers = [str(ws.cell_value(0, c)).strip() for c in range(ws.ncols)]
203
- col = {h: i for i, h in enumerate(headers)}
204
-
205
- uid_col = col.get("Uid") or col.get("UID") or col.get("uid")
206
- type_col = col.get("Type") or col.get("type")
207
- status_col = col.get("Status") or col.get("status")
208
- by_col = col.get("SubmittedBy") or col.get("Submitted By") or col.get("submittedby")
209
- title_col = col.get("Title") or col.get("title")
210
-
211
- for name, c in [("Uid", uid_col), ("Type", type_col),
212
- ("Status", status_col), ("SubmittedBy", by_col)]:
213
- if c is None:
214
- raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")
215
-
216
- results = []
217
- for r in range(2, ws.nrows):
218
- uid = str(ws.cell_value(r, uid_col)).strip()
219
- doc_type = str(ws.cell_value(r, type_col)).strip()
220
- status = str(ws.cell_value(r, status_col)).strip()
221
- submitted_by = str(ws.cell_value(r, by_col)).strip()
222
- title = str(ws.cell_value(r, title_col)).strip() if title_col is not None else ""
223
-
224
- if doc_type != "CR":
225
- continue
226
- if status != "Accepted":
227
- continue
228
-
229
- results.append((uid, title, submitted_by))
230
-
231
- return results
232
-
233
-
234
- def _parse_xlsx_all(path: Path):
235
- """Return (uid, title, submitted_by) for all Accepted CRs (no person filter)."""
236
- try:
237
- import openpyxl
238
- except ImportError:
239
- sys.exit("ERROR: openpyxl is not installed. Run: pip install openpyxl")
240
-
241
- wb = openpyxl.load_workbook(str(path), read_only=True, data_only=True)
242
- ws = wb["Contributions"] if "Contributions" in wb.sheetnames else wb.active
243
-
244
- rows = iter(ws.iter_rows(values_only=True))
245
- header_row = next(rows)
246
- headers = [str(h).strip() if h is not None else "" for h in header_row]
247
- col = {h: i for i, h in enumerate(headers)}
248
-
249
- next(rows, None) # skip empty duplicate row
250
-
251
- uid_col = col.get("Uid") or col.get("UID") or col.get("uid")
252
- type_col = col.get("Type") or col.get("type")
253
- status_col = col.get("Status") or col.get("status")
254
- by_col = col.get("SubmittedBy") or col.get("Submitted By") or col.get("submittedby")
255
- title_col = col.get("Title") or col.get("title")
256
-
257
- for name, c in [("Uid", uid_col), ("Type", type_col),
258
- ("Status", status_col), ("SubmittedBy", by_col)]:
259
- if c is None:
260
- raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")
261
-
262
- results = []
263
- for row in rows:
264
- def cell(c):
265
- v = row[c] if c < len(row) else None
266
- return str(v).strip() if v is not None else ""
267
-
268
- uid = cell(uid_col)
269
- doc_type = cell(type_col)
270
- status = cell(status_col)
271
- submitted_by = cell(by_col)
272
- title = cell(title_col) if title_col is not None else ""
273
-
274
- if not uid:
275
- continue
276
- if doc_type != "CR":
277
- continue
278
- if status != "Accepted":
279
- continue
280
-
281
- results.append((uid, title, submitted_by))
282
 
283
  return results
284
 
@@ -336,8 +255,29 @@ def download_cr(uid: str, cr_dir: Path, eol_user: str, eol_password: str):
336
  # Step 3 β€” Parse CR Cover Pages
337
  # ---------------------------------------------------------------------------
338
 
339
- SPEC_PATTERN = re.compile(r"^\d{3}\s\d{3}$")
340
- VERSION_PATTERN = re.compile(r"^\d+\.\d+\.\d+$")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
 
342
 
343
  def parse_cr_cover(docx_path: Path):
@@ -360,7 +300,9 @@ def parse_cr_cover(docx_path: Path):
360
  if not doc.tables:
361
  return None, None
362
 
363
- table = doc.tables[0]
 
 
364
 
365
  # Collect all non-empty cell texts in order
366
  cells = []
@@ -376,20 +318,36 @@ def parse_cr_cover(docx_path: Path):
376
  version = None
377
 
378
  for i, text in enumerate(cells):
379
- # Look for spec number: "NNN NNN" pattern
380
- if SPEC_PATTERN.match(text) and spec_number is None:
381
  spec_number = text
382
 
383
- # Look for version: cell immediately after "Current version:"
 
 
 
 
 
 
 
 
384
  if "Current version:" in text and i + 1 < len(cells):
385
  candidate = cells[i + 1]
386
  if VERSION_PATTERN.match(candidate):
387
- version = candidate
388
 
389
- # Also accept "Current version" without colon
390
  if text in ("Current version:", "Current version") and version is None:
391
  if i + 1 < len(cells) and VERSION_PATTERN.match(cells[i + 1]):
392
- version = cells[i + 1]
 
 
 
 
 
 
 
 
 
393
 
394
  if spec_number:
395
  spec_number = spec_number.replace('\xa0', ' ').strip()
 
67
  ext = path.suffix.lower()
68
 
69
  if ext == ".xls":
70
+ return _parse_xls(path)
71
  elif ext == ".xlsx":
72
+ return _parse_xlsx(path)
73
  else:
74
  raise ValueError(f"Unsupported file extension: {ext!r}. Expected .xls or .xlsx")
75
 
 
78
  return re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE)
79
 
80
 
81
+ def _parse_xls(path: Path, person_name: str | None = None):
82
+ """
83
+ Return Accepted CRs from an .xls file.
84
+ If person_name is given: return (uid, title) filtered to that name.
85
+ If person_name is None: return (uid, title, submitted_by) for all.
86
+ """
87
  try:
88
  import xlrd
89
  except ImportError:
 
106
  by_col = col.get("SubmittedBy") or col.get("Submitted By") or col.get("submittedby")
107
  title_col = col.get("Title") or col.get("title")
108
 
109
+ for name, c, required in [
110
+ ("Uid", uid_col, True), ("Type", type_col, True),
111
+ ("Status", status_col, True), ("SubmittedBy", by_col, True),
112
+ ("Title", title_col, False),
113
+ ]:
114
+ if c is None and required:
115
  raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")
116
 
117
+ pattern = _name_pattern(person_name) if person_name else None
118
  results = []
119
 
120
  for r in range(2, ws.nrows): # skip header + empty duplicate
 
128
  continue
129
  if status != "Accepted":
130
  continue
131
+ if pattern and not pattern.search(submitted_by):
132
  continue
133
 
134
+ results.append((uid, title) if person_name is not None else (uid, title, submitted_by))
135
 
136
  return results
137
 
138
 
139
+ def _parse_xlsx(path: Path, person_name: str | None = None):
140
+ """
141
+ Return Accepted CRs from an .xlsx file.
142
+ If person_name is given: return (uid, title) filtered to that name.
143
+ If person_name is None: return (uid, title, submitted_by) for all.
144
+ """
145
  try:
146
  import openpyxl
147
  except ImportError:
 
166
  by_col = col.get("SubmittedBy") or col.get("Submitted By") or col.get("submittedby")
167
  title_col = col.get("Title") or col.get("title")
168
 
169
+ for name, c, required in [
170
+ ("Uid", uid_col, True), ("Type", type_col, True),
171
+ ("Status", status_col, True), ("SubmittedBy", by_col, True),
172
+ ("Title", title_col, False),
173
+ ]:
174
+ if c is None and required:
175
  raise ValueError(f"Column {name!r} not found. Available: {list(col.keys())}")
176
 
177
+ pattern = _name_pattern(person_name) if person_name else None
178
  results = []
179
 
180
  for row in rows:
 
194
  continue
195
  if status != "Accepted":
196
  continue
197
+ if pattern and not pattern.search(submitted_by):
198
  continue
199
 
200
+ results.append((uid, title) if person_name is not None else (uid, title, submitted_by))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
  return results
203
 
 
255
  # Step 3 — Parse CR Cover Pages
256
  # ---------------------------------------------------------------------------
257
 
258
+ SPEC_PATTERN = re.compile(r"^\d{3}\s\d{3}(-\d+)*$") # "102 221" or "102 230-2"
259
+ SPEC_SEARCH = re.compile(r"\b\d{3}\s\d{3}(?:-\d+)*\b") # substring search fallback
260
+ VERSION_PATTERN = re.compile(r"^[Vv]?\d+\.\d+(\.\d+)?$") # X.Y or X.Y.Z, optional V prefix
261
+ VERSION_SEARCH = re.compile(r"\b\d+\.\d+\.\d+\b") # substring fallback
262
+
263
+
264
+ def _normalise_version(v: str) -> str:
265
+ """Strip optional V prefix and pad to X.Y.Z."""
266
+ v = v.lstrip('Vv')
267
+ parts = v.split('.')
268
+ while len(parts) < 3:
269
+ parts.append('0')
270
+ return '.'.join(parts[:3])
271
+
272
+
273
+ def _find_cover_table(doc):
274
+ """Return the CR cover table, scanning all tables for one containing CHANGE REQUEST."""
275
+ MARKERS = {"CHANGE REQUEST", "CR", "CHANGE REQUEST"}
276
+ for tbl in doc.tables:
277
+ cells_text = {c.text.strip() for row in tbl.rows for c in row.cells}
278
+ if cells_text & MARKERS:
279
+ return tbl
280
+ return None
281
 
282
 
283
  def parse_cr_cover(docx_path: Path):
 
300
  if not doc.tables:
301
  return None, None
302
 
303
+ table = _find_cover_table(doc)
304
+ if table is None:
305
+ return None, None
306
 
307
  # Collect all non-empty cell texts in order
308
  cells = []
 
318
  version = None
319
 
320
  for i, text in enumerate(cells):
321
+ # ── Strategy 1: exact cell match "NNN NNN" or "NNN NNN-N" ────────────
322
+ if spec_number is None and SPEC_PATTERN.match(text):
323
  spec_number = text
324
 
325
+ # ── Strategy 2: positional — cell immediately after "CHANGE REQUEST" ─
326
+ # The cover table always places the spec number in the cell right after
327
+ # the "CHANGE REQUEST" label.
328
+ if spec_number is None and text.strip() == "CHANGE REQUEST" and i + 1 < len(cells):
329
+ candidate = cells[i + 1].strip()
330
+ if SPEC_PATTERN.match(candidate):
331
+ spec_number = candidate
332
+
333
+ # ── Version: cell immediately after "Current version:" ───────────────
334
  if "Current version:" in text and i + 1 < len(cells):
335
  candidate = cells[i + 1]
336
  if VERSION_PATTERN.match(candidate):
337
+ version = _normalise_version(candidate)
338
 
 
339
  if text in ("Current version:", "Current version") and version is None:
340
  if i + 1 < len(cells) and VERSION_PATTERN.match(cells[i + 1]):
341
+ version = _normalise_version(cells[i + 1])
342
+
343
+ # ── Strategy 3: substring search across all cells ─────────────────────────
344
+ # Catches cases where the spec number is embedded in a longer cell string.
345
+ if spec_number is None:
346
+ for text in cells:
347
+ m = SPEC_SEARCH.search(text)
348
+ if m:
349
+ spec_number = m.group(0)
350
+ break
351
 
352
  if spec_number:
353
  spec_number = spec_number.replace('\xa0', ' ').strip()
scripts/finalize_ts.py CHANGED
@@ -27,6 +27,7 @@ from docx_helpers import (
27
  AUTHOR,
28
  DATE,
29
  )
 
30
 
31
 
32
  # ── Path helpers ──────────────────────────────────────────────────────────────
@@ -58,6 +59,8 @@ def compute_pub_date():
58
  def derive_new_version(v: str) -> str:
59
  """Increment middle component of X.Y.Z β†’ X.(Y+1).0."""
60
  parts = v.split('.')
 
 
61
  parts[1] = str(int(parts[1]) + 1)
62
  parts[2] = '0'
63
  return '.'.join(parts)
@@ -75,7 +78,9 @@ def extract_cr_metadata(cr_docx_path: str) -> dict:
75
  if not doc.tables:
76
  raise ValueError('CR has no tables β€” cannot extract metadata')
77
 
78
- tbl = doc.tables[0]
 
 
79
 
80
  # Collect all cell texts for scanning
81
  cells = []
@@ -165,10 +170,16 @@ def _detect_meeting_separator(tbl):
165
  number, e.g. '#' in 'SET#115' or '-' in 'SET-119'.
166
  Returns the detected separator character, defaulting to '#'.
167
  """
 
 
 
 
 
 
168
  for row in reversed(tbl.rows):
169
  cells = row.cells
170
- if len(cells) > 1:
171
- text = cells[1].text.strip()
172
  if text:
173
  m = re.search(r'[A-Za-z]([^A-Za-z0-9])\d', text)
174
  if m:
@@ -198,7 +209,8 @@ def find_change_history_table(ts_doc):
198
  continue
199
  if tbl.rows:
200
  header_text = ' '.join(c.text.strip() for c in tbl.rows[0].cells).lower()
201
- if any(kw in header_text for kw in ('cr', 'date', 'meeting', 'rev')):
 
202
  return tbl
203
  raise NoChangeHistoryTable(
204
  'No Change History table found in this document '
 
27
  AUTHOR,
28
  DATE,
29
  )
30
+ from fetch_crs import _find_cover_table
31
 
32
 
33
  # ── Path helpers ──────────────────────────────────────────────────────────────
 
59
  def derive_new_version(v: str) -> str:
60
  """Increment middle component of X.Y.Z β†’ X.(Y+1).0."""
61
  parts = v.split('.')
62
+ if len(parts) < 3:
63
+ parts += ['0'] * (3 - len(parts))
64
  parts[1] = str(int(parts[1]) + 1)
65
  parts[2] = '0'
66
  return '.'.join(parts)
 
78
  if not doc.tables:
79
  raise ValueError('CR has no tables β€” cannot extract metadata')
80
 
81
+ tbl = _find_cover_table(doc)
82
+ if tbl is None:
83
+ raise ValueError('CR cover table not found β€” no table containing "CHANGE REQUEST"')
84
 
85
  # Collect all cell texts for scanning
86
  cells = []
 
170
  number, e.g. '#' in 'SET#115' or '-' in 'SET-119'.
171
  Returns the detected separator character, defaulting to '#'.
172
  """
173
+ meet_col = 1 # default: standard ETSI Change History has Meeting in col 1
174
+ if tbl.rows:
175
+ for c_idx, cell in enumerate(tbl.rows[0].cells):
176
+ if any(kw in cell.text.lower() for kw in ('meeting', 'body', 'tsg')):
177
+ meet_col = c_idx
178
+ break
179
  for row in reversed(tbl.rows):
180
  cells = row.cells
181
+ if len(cells) > meet_col:
182
+ text = cells[meet_col].text.strip()
183
  if text:
184
  m = re.search(r'[A-Za-z]([^A-Za-z0-9])\d', text)
185
  if m:
 
209
  continue
210
  if tbl.rows:
211
  header_text = ' '.join(c.text.strip() for c in tbl.rows[0].cells).lower()
212
+ header_words = set(re.findall(r'\b\w+\b', header_text))
213
+ if {'cr', 'date'}.issubset(header_words):
214
  return tbl
215
  raise NoChangeHistoryTable(
216
  'No Change History table found in this document '
scripts/hf_cr_index.py CHANGED
@@ -44,15 +44,18 @@ def push_hf_index(records: list[dict], hf_token: str, hf_repo: str) -> None:
44
  exist_ok=True,
45
  token=hf_token,
46
  )
47
- tmp = Path("/tmp/cr_index.jsonl")
48
- tmp.write_text(
49
- "\n".join(json.dumps(r, ensure_ascii=False) for r in records),
50
- encoding="utf-8",
51
- )
52
- api.upload_file(
53
- path_or_fileobj=str(tmp),
54
- path_in_repo="cr_index.jsonl",
55
- repo_id=hf_repo,
56
- repo_type="dataset",
57
- token=hf_token,
58
- )
 
 
 
 
44
  exist_ok=True,
45
  token=hf_token,
46
  )
47
+ import tempfile
48
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl',
49
+ encoding='utf-8', delete=False) as _f:
50
+ _f.write("\n".join(json.dumps(r, ensure_ascii=False) for r in records))
51
+ _tmp_path = _f.name
52
+ try:
53
+ api.upload_file(
54
+ path_or_fileobj=_tmp_path,
55
+ path_in_repo="cr_index.jsonl",
56
+ repo_id=hf_repo,
57
+ repo_type="dataset",
58
+ token=hf_token,
59
+ )
60
+ finally:
61
+ Path(_tmp_path).unlink(missing_ok=True)
scripts/orchestrate_cr.py CHANGED
@@ -81,6 +81,173 @@ class _TeeWriter:
81
  self._real.flush()
82
 
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  # ── Shared Steps 2, 4, 5, 6 ──────────────────────────────────────────────────
85
 
86
  def _run_steps_2_to_6(cr_list, ts_groups, output_dir, cr_dir, ts_dir,
@@ -182,144 +349,12 @@ def _run_steps_2_to_6(cr_list, ts_groups, output_dir, cr_dir, ts_dir,
182
  report = [] # (ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors)
183
 
184
  for (spec_number, version), uids in ts_groups.items():
185
- ts_key = f'TS {spec_number} v{version}'
186
  spec_compact = spec_number.replace(' ', '')
187
  spec_dir = spec_dirs.get((spec_number, version), ts_dir / spec_compact)
188
  spec_dir.mkdir(parents=True, exist_ok=True)
189
-
190
- new_v = derive_new_version(version)
191
- stem = f'ts_{spec_compact}_v{new_v}_was_v{version}'
192
- ts_applied = spec_dir / f'ts_{spec_compact}_v{version}_applied.docx'
193
- ts_final = spec_dir / f'{stem}.docx'
194
- log_path = spec_dir / f'{stem}.log'
195
- errors = []
196
-
197
- print(f'\n-- {ts_key} ({len(uids)} CR(s): {", ".join(uids)}) --')
198
-
199
- if (spec_number, version) not in ts_paths:
200
- msg = 'TS download failed — skipping'
201
- print(f' SKIP: {msg}')
202
- report.append((ts_key, 0, 0, len(uids), None, log_path, [msg]))
203
- continue
204
-
205
- ts_in = ts_paths[(spec_number, version)]
206
-
207
- log_buf = io.StringIO()
208
- tee = _TeeWriter(sys.stdout, log_buf)
209
-
210
- with contextlib.redirect_stdout(tee):
211
- log_header = (
212
- f'Pipeline Log\n'
213
- f'TS: {spec_number} v{version} -> v{new_v}\n'
214
- f'CRs: {", ".join(uids)}\n'
215
- f'Date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n'
216
- f'{"=" * 60}\n'
217
- )
218
- print(log_header, end='')
219
-
220
- combined_manifest = []
221
- participating_uids = []
222
-
223
- for uid in uids:
224
- if uid not in cr_paths:
225
- errors.append(f'[{uid}] CR download had failed — skipped')
226
- continue
227
- print(f' Parsing {uid}... ', end='', flush=True)
228
- try:
229
- changes = parse_cr(cr_paths[uid])
230
- combined_manifest.extend(changes)
231
- participating_uids.append(uid)
232
- print(f'{len(changes)} change(s)')
233
- except Exception as e:
234
- errors.append(f'[{uid}] parse ERROR: {e}')
235
- print(f'ERROR: {e}')
236
-
237
- if not combined_manifest:
238
- print(' No changes parsed — skipping apply step.')
239
- report.append((ts_key, 0, 0, len(uids), None, log_path,
240
- errors + ['No changes parsed']))
241
- log_path.write_text(log_buf.getvalue(), encoding='utf-8')
242
- continue
243
-
244
- print(f' Applying {len(combined_manifest)} change(s) to {ts_in.name}...')
245
- try:
246
- n_ok, n_skip, log_lines = apply_manifest(
247
- ts_in, combined_manifest, ts_applied, author=author, date=tc_date
248
- )
249
- except Exception as e:
250
- errors.append(f'apply_manifest ERROR: {e}')
251
- print(f' ERROR: {e}')
252
- report.append((ts_key, 0, 0, len(uids), None, log_path, errors))
253
- log_path.write_text(log_buf.getvalue(), encoding='utf-8')
254
- continue
255
-
256
- for line in log_lines:
257
- print(f' {line}')
258
- for line in log_lines:
259
- if line.strip().startswith('ERROR'):
260
- errors.append(line.strip())
261
- print(f' -> Applied: {n_ok} Skipped: {n_skip}')
262
-
263
- print(' Finalising metadata...')
264
- try:
265
- ts_doc = docx_lib.Document(str(ts_applied))
266
- rev = RevCounter(ts_doc)
267
-
268
- pub_ym, pub_month_year = compute_pub_date()
269
- old_v = version
270
-
271
- title_text = ts_doc.paragraphs[0].text
272
- date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
273
- old_date_str = date_match.group(1) if date_match else ''
274
-
275
- print(f' Version: {old_v} -> {new_v}')
276
- print(f' Publication: {pub_month_year} ({pub_ym})')
277
-
278
- for uid in participating_uids:
279
- try:
280
- meta = extract_cr_metadata(str(cr_paths[uid]))
281
- ch_cells = update_change_history_table(
282
- ts_doc, meta, pub_ym, old_v, new_v, rev, author, tc_date
283
- )
284
- print(f' [Change History] {uid}: {ch_cells}')
285
- except NoChangeHistoryTable:
286
- print(f' [Change History] {uid}: NOT PRESENT — this document has no Change History table (History table only)')
287
- except Exception as e:
288
- errors.append(f'[{uid}] Change History ERROR: {e}')
289
- print(f' [Change History] {uid}: ERROR — {e}')
290
-
291
- try:
292
- h_cells = update_history_table(
293
- ts_doc, new_v, pub_month_year, rev, author, tc_date
294
- )
295
- print(f' [History] {h_cells}')
296
- except Exception as e:
297
- errors.append(f'History table ERROR: {e}')
298
- print(f' [History] ERROR — {e}')
299
-
300
- if old_date_str:
301
- try:
302
- update_title_para(
303
- ts_doc, old_v, new_v, old_date_str, pub_ym, rev, author, tc_date
304
- )
305
- print(f' [Title] V{old_v} -> V{new_v}, ({old_date_str}) -> ({pub_ym})')
306
- except Exception as e:
307
- errors.append(f'Title update ERROR: {e}')
308
- print(f' [Title] ERROR — {e}')
309
- else:
310
- print(f' [Title] SKIP — no (YYYY-MM) pattern in: {title_text!r}')
311
-
312
- ts_doc.save(str(ts_final))
313
- print(f' Saved: {spec_compact}/{ts_final.name}')
314
- print(f' Log: {spec_compact}/{log_path.name}')
315
- report.append((ts_key, n_ok, n_skip, len(uids), ts_final, log_path, errors))
316
-
317
- except Exception as e:
318
- errors.append(f'Finalisation ERROR: {e}')
319
- print(f' Finalisation ERROR: {e}')
320
- report.append((ts_key, n_ok, n_skip, len(uids), ts_applied, log_path, errors))
321
-
322
- log_path.write_text(log_buf.getvalue(), encoding='utf-8')
323
 
324
  return report, cr_paths, ts_paths, spec_dirs
325
 
@@ -450,144 +485,13 @@ def main():
450
  report = []
451
 
452
  for (spec_number, version), uids in ts_groups.items():
453
- ts_key = f'TS {spec_number} v{version}'
454
  spec_compact = spec_number.replace(' ', '')
455
  spec_dir = spec_dirs.get((spec_number, version), ts_dir / spec_compact)
456
  spec_dir.mkdir(parents=True, exist_ok=True)
457
-
458
- new_v = derive_new_version(version)
459
- stem = f'ts_{spec_compact}_v{new_v}_was_v{version}'
460
- ts_applied = spec_dir / f'ts_{spec_compact}_v{version}_applied.docx'
461
- ts_final = spec_dir / f'{stem}.docx'
462
- log_path = spec_dir / f'{stem}.log'
463
- errors = []
464
-
465
- print(f'\n-- {ts_key} ({len(uids)} CR(s): {", ".join(uids)}) --')
466
-
467
- if (spec_number, version) not in ts_paths:
468
- msg = 'TS DOCX not on disk — skipping'
469
- print(f' SKIP: {msg}')
470
- report.append((ts_key, 0, 0, len(uids), None, log_path, [msg]))
471
- continue
472
-
473
- ts_in = ts_paths[(spec_number, version)]
474
-
475
- log_buf = io.StringIO()
476
- tee = _TeeWriter(sys.stdout, log_buf)
477
-
478
- with contextlib.redirect_stdout(tee):
479
- log_header = (
480
- f'Pipeline Log (retry)\n'
481
- f'TS: {spec_number} v{version} -> v{new_v}\n'
482
- f'CRs: {", ".join(uids)}\n'
483
- f'Date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n'
484
- f'{"=" * 60}\n'
485
- )
486
- print(log_header, end='')
487
-
488
- combined_manifest = []
489
- participating_uids = []
490
-
491
- for uid in uids:
492
- if uid not in cr_paths:
493
- errors.append(f'[{uid}] CR DOCX not found — skipped')
494
- continue
495
- print(f' Parsing {uid}... ', end='', flush=True)
496
- try:
497
- changes = parse_cr(cr_paths[uid])
498
- combined_manifest.extend(changes)
499
- participating_uids.append(uid)
500
- print(f'{len(changes)} change(s)')
501
- except Exception as e:
502
- errors.append(f'[{uid}] parse ERROR: {e}')
503
- print(f'ERROR: {e}')
504
-
505
- if not combined_manifest:
506
- print(' No changes parsed — skipping apply step.')
507
- report.append((ts_key, 0, 0, len(uids), None, log_path,
508
- errors + ['No changes parsed']))
509
- log_path.write_text(log_buf.getvalue(), encoding='utf-8')
510
- continue
511
-
512
- print(f' Applying {len(combined_manifest)} change(s) to {ts_in.name}...')
513
- try:
514
- n_ok, n_skip, log_lines = apply_manifest(
515
- ts_in, combined_manifest, ts_applied, author=author, date=tc_date
516
- )
517
- except Exception as e:
518
- errors.append(f'apply_manifest ERROR: {e}')
519
- print(f' ERROR: {e}')
520
- report.append((ts_key, 0, 0, len(uids), None, log_path, errors))
521
- log_path.write_text(log_buf.getvalue(), encoding='utf-8')
522
- continue
523
-
524
- for line in log_lines:
525
- print(f' {line}')
526
- for line in log_lines:
527
- if line.strip().startswith('ERROR'):
528
- errors.append(line.strip())
529
- print(f' -> Applied: {n_ok} Skipped: {n_skip}')
530
-
531
- print(' Finalising metadata...')
532
- try:
533
- ts_doc = docx_lib.Document(str(ts_applied))
534
- rev = RevCounter(ts_doc)
535
-
536
- pub_ym, pub_month_year = compute_pub_date()
537
- old_v = version
538
-
539
- title_text = ts_doc.paragraphs[0].text
540
- date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
541
- old_date_str = date_match.group(1) if date_match else ''
542
-
543
- print(f' Version: {old_v} -> {new_v}')
544
- print(f' Publication: {pub_month_year} ({pub_ym})')
545
-
546
- for uid in participating_uids:
547
- try:
548
- meta = extract_cr_metadata(str(cr_paths[uid]))
549
- ch_cells = update_change_history_table(
550
- ts_doc, meta, pub_ym, old_v, new_v, rev, author, tc_date
551
- )
552
- print(f' [Change History] {uid}: {ch_cells}')
553
- except NoChangeHistoryTable:
554
- print(f' [Change History] {uid}: NOT PRESENT — this document has no Change History table (History table only)')
555
- except Exception as e:
556
- errors.append(f'[{uid}] Change History ERROR: {e}')
557
- print(f' [Change History] {uid}: ERROR — {e}')
558
-
559
- try:
560
- h_cells = update_history_table(
561
- ts_doc, new_v, pub_month_year, rev, author, tc_date
562
- )
563
- print(f' [History] {h_cells}')
564
- except Exception as e:
565
- errors.append(f'History table ERROR: {e}')
566
- print(f' [History] ERROR — {e}')
567
-
568
- if old_date_str:
569
- try:
570
- update_title_para(
571
- ts_doc, old_v, new_v, old_date_str, pub_ym, rev, author, tc_date
572
- )
573
- print(f' [Title] V{old_v} -> V{new_v}, ({old_date_str}) -> ({pub_ym})')
574
- except Exception as e:
575
- errors.append(f'Title update ERROR: {e}')
576
- print(f' [Title] ERROR — {e}')
577
- else:
578
- print(f' [Title] SKIP — no (YYYY-MM) pattern in: {title_text!r}')
579
-
580
- ts_doc.save(str(ts_final))
581
- print(f' Saved: {spec_compact}/{ts_final.name}')
582
- print(f' Log: {spec_compact}/{log_path.name}')
583
- report.append((ts_key, n_ok, n_skip, len(uids), ts_final, log_path, errors))
584
-
585
- except Exception as e:
586
- errors.append(f'Finalisation ERROR: {e}')
587
- print(f' Finalisation ERROR: {e}')
588
- report.append((ts_key, n_ok, n_skip, len(uids), ts_applied, log_path, errors))
589
-
590
- log_path.write_text(log_buf.getvalue(), encoding='utf-8')
591
 
592
  # Update failed_ts.json — remove entries that are now resolved
593
  still_failed = [
@@ -601,11 +505,7 @@ def main():
601
  n_partial = sum(1 for r in report if r[4] is not None and r[6])
602
  n_failed = sum(1 for r in report if r[4] is None)
603
  print(f'TSs processed: {n_success} fully OK, {n_partial} with warnings, {n_failed} skipped/failed')
604
- for ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors in report:
605
- status_tag = 'OK' if out_path and not errors else ('WARN' if out_path else 'SKIP')
606
- print(f' [{status_tag}] {ts_key}')
607
- for err in errors:
608
- print(f' ! {err}')
609
  return
610
 
611
  # ── TS mode β€” load HF index, skip Steps 1 & 3 ────────────────────────────
@@ -656,12 +556,7 @@ def main():
656
  # Copy the CRs actually applied into the run output dir so the ZIP
657
  # contains exactly the CRs used for this TS (only needed when using
658
  # a shared CR cache that lives outside output_dir).
659
- _run_cr_dir = output_dir / 'CRs'
660
- if cr_dir.resolve() != _run_cr_dir.resolve():
661
- _run_cr_dir.mkdir(parents=True, exist_ok=True)
662
- for _p in cr_paths.values():
663
- if _p.exists():
664
- shutil.copy2(_p, _run_cr_dir / _p.name)
665
 
666
  _section('Final Report (TS mode)')
667
  n_success = sum(1 for r in report if r[4] is not None and not r[6])
@@ -673,16 +568,7 @@ def main():
673
  print(f'TSs updated: {n_success} fully OK, {n_partial} with warnings, {n_failed} failed')
674
  print()
675
 
676
- for ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors in report:
677
- status = 'OK' if out_path and not errors else ('WARN' if out_path else 'FAIL')
678
- print(f' [{status}] {ts_key}')
679
- print(f' CRs: {n_crs} | Body changes applied: {n_ok} | Skipped: {n_skip}')
680
- if out_path:
681
- print(f' Output: {out_path.parent.name}/{out_path.name}')
682
- if log_path and log_path.exists():
683
- print(f' Log: {log_path.parent.name}/{log_path.name}')
684
- for err in errors:
685
- print(f' ! {err}')
686
 
687
  print()
688
  print(f'Output directory: {output_dir}/')
@@ -716,12 +602,7 @@ def main():
716
  # Copy the CRs actually applied into the run output dir so the ZIP
717
  # contains exactly the CRs used for this run (only needed when using
718
  # a shared CR cache that lives outside output_dir).
719
- _run_cr_dir = output_dir / 'CRs'
720
- if cr_dir.resolve() != _run_cr_dir.resolve():
721
- _run_cr_dir.mkdir(parents=True, exist_ok=True)
722
- for _p in cr_paths.values():
723
- if _p.exists():
724
- shutil.copy2(_p, _run_cr_dir / _p.name)
725
 
726
  # ── Final Report ──────────────────────────────────────────────────────────
727
  _section('Final Report')
@@ -735,21 +616,7 @@ def main():
735
  print(f'TSs updated: {n_success} fully OK, {n_partial} with warnings, {n_failed} failed')
736
  print()
737
 
738
- for ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors in report:
739
- if out_path and not errors:
740
- status = 'OK'
741
- elif out_path:
742
- status = 'WARN'
743
- else:
744
- status = 'FAIL'
745
- print(f' [{status}] {ts_key}')
746
- print(f' CRs: {n_crs} | Body changes applied: {n_ok} | Skipped: {n_skip}')
747
- if out_path:
748
- print(f' Output: {out_path.parent.name}/{out_path.name}')
749
- if log_path and log_path.exists():
750
- print(f' Log: {log_path.parent.name}/{log_path.name}')
751
- for err in errors:
752
- print(f' ! {err}')
753
 
754
  print()
755
  print(f'Output directory: {output_dir}/')
 
81
  self._real.flush()
82
 
83
 
84
+ # ── Small report / cache helpers ─────────────────────────────────────────────
85
+
86
+ def _print_report(report, *, detailed=True):
87
+ """Print per-TS result lines from a report list."""
88
+ for ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors in report:
89
+ status = 'OK' if out_path and not errors else ('WARN' if out_path else 'FAIL')
90
+ print(f' [{status}] {ts_key}')
91
+ if detailed:
92
+ print(f' CRs: {n_crs} | Body changes applied: {n_ok} | Skipped: {n_skip}')
93
+ if out_path:
94
+ print(f' Output: {out_path.parent.name}/{out_path.name}')
95
+ if log_path and log_path.exists():
96
+ print(f' Log: {log_path.parent.name}/{log_path.name}')
97
+ for err in errors:
98
+ print(f' ! {err}')
99
+
100
+
101
+ def _copy_cr_cache_if_needed(cr_paths, cr_dir, output_dir):
102
+ """Copy downloaded CRs into output_dir/CRs when a shared cache is used."""
103
+ run_cr_dir = output_dir / 'CRs'
104
+ if cr_dir.resolve() != run_cr_dir.resolve():
105
+ run_cr_dir.mkdir(parents=True, exist_ok=True)
106
+ for p in cr_paths.values():
107
+ if p.exists():
108
+ shutil.copy2(p, run_cr_dir / p.name)
109
+
110
+
111
+ # ── Per-TS-group apply helper ─────────────────────────────────────────────────
112
+
113
+ def _apply_ts_group(spec_number, version, uids, ts_paths, cr_paths, spec_dir,
114
+ author, tc_date, log_label='Pipeline Log'):
115
+ """Parse, apply, and finalise one TS group. Returns one report tuple."""
116
+ ts_key = f'TS {spec_number} v{version}'
117
+ spec_compact = spec_number.replace(' ', '')
118
+ new_v = derive_new_version(version)
119
+ stem = f'ts_{spec_compact}_v{new_v}_was_v{version}'
120
+ ts_applied = spec_dir / f'ts_{spec_compact}_v{version}_applied.docx'
121
+ ts_final = spec_dir / f'{stem}.docx'
122
+ log_path = spec_dir / f'{stem}.log'
123
+ errors = []
124
+
125
+ print(f'\n-- {ts_key} ({len(uids)} CR(s): {", ".join(uids)}) --')
126
+
127
+ if (spec_number, version) not in ts_paths:
128
+ msg = 'TS download failed — skipping'
129
+ print(f' SKIP: {msg}')
130
+ return (ts_key, 0, 0, len(uids), None, log_path, [msg])
131
+
132
+ ts_in = ts_paths[(spec_number, version)]
133
+
134
+ log_buf = io.StringIO()
135
+ tee = _TeeWriter(sys.stdout, log_buf)
136
+
137
+ with contextlib.redirect_stdout(tee):
138
+ log_header = (
139
+ f'{log_label}\n'
140
+ f'TS: {spec_number} v{version} -> v{new_v}\n'
141
+ f'CRs: {", ".join(uids)}\n'
142
+ f'Date: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n'
143
+ f'{"=" * 60}\n'
144
+ )
145
+ print(log_header, end='')
146
+
147
+ combined_manifest = []
148
+ participating_uids = []
149
+
150
+ for uid in uids:
151
+ if uid not in cr_paths:
152
+ errors.append(f'[{uid}] CR download had failed — skipped')
153
+ continue
154
+ print(f' Parsing {uid}... ', end='', flush=True)
155
+ try:
156
+ changes = parse_cr(cr_paths[uid])
157
+ combined_manifest.extend(changes)
158
+ participating_uids.append(uid)
159
+ print(f'{len(changes)} change(s)')
160
+ except Exception as e:
161
+ errors.append(f'[{uid}] parse ERROR: {e}')
162
+ print(f'ERROR: {e}')
163
+
164
+ if not combined_manifest:
165
+ print(' No changes parsed — skipping apply step.')
166
+ log_path.write_text(log_buf.getvalue(), encoding='utf-8')
167
+ return (ts_key, 0, 0, len(uids), None, log_path,
168
+ errors + ['No changes parsed'])
169
+
170
+ print(f' Applying {len(combined_manifest)} change(s) to {ts_in.name}...')
171
+ try:
172
+ n_ok, n_skip, log_lines = apply_manifest(
173
+ ts_in, combined_manifest, ts_applied, author=author, date=tc_date
174
+ )
175
+ except Exception as e:
176
+ errors.append(f'apply_manifest ERROR: {e}')
177
+ print(f' ERROR: {e}')
178
+ log_path.write_text(log_buf.getvalue(), encoding='utf-8')
179
+ return (ts_key, 0, 0, len(uids), None, log_path, errors)
180
+
181
+ for line in log_lines:
182
+ print(f' {line}')
183
+ for line in log_lines:
184
+ if line.strip().startswith('ERROR'):
185
+ errors.append(line.strip())
186
+ print(f' -> Applied: {n_ok} Skipped: {n_skip}')
187
+
188
+ print(' Finalising metadata...')
189
+ ts_final_or_applied = ts_applied # fallback if finalise raises
190
+ try:
191
+ ts_doc = docx_lib.Document(str(ts_applied))
192
+ rev = RevCounter(ts_doc)
193
+
194
+ pub_ym, pub_month_year = compute_pub_date()
195
+ old_v = version
196
+
197
+ title_text = ts_doc.paragraphs[0].text
198
+ date_match = re.search(r'\((\d{4}-\d{2})\)', title_text)
199
+ old_date_str = date_match.group(1) if date_match else ''
200
+
201
+ print(f' Version: {old_v} -> {new_v}')
202
+ print(f' Publication: {pub_month_year} ({pub_ym})')
203
+
204
+ for uid in participating_uids:
205
+ try:
206
+ meta = extract_cr_metadata(str(cr_paths[uid]))
207
+ ch_cells = update_change_history_table(
208
+ ts_doc, meta, pub_ym, old_v, new_v, rev, author, tc_date
209
+ )
210
+ print(f' [Change History] {uid}: {ch_cells}')
211
+ except NoChangeHistoryTable:
212
+ print(f' [Change History] {uid}: NOT PRESENT — this document has no Change History table (History table only)')
213
+ except Exception as e:
214
+ errors.append(f'[{uid}] Change History ERROR: {e}')
215
+ print(f' [Change History] {uid}: ERROR — {e}')
216
+
217
+ try:
218
+ h_cells = update_history_table(
219
+ ts_doc, new_v, pub_month_year, rev, author, tc_date
220
+ )
221
+ print(f' [History] {h_cells}')
222
+ except Exception as e:
223
+ errors.append(f'History table ERROR: {e}')
224
+ print(f' [History] ERROR — {e}')
225
+
226
+ if old_date_str:
227
+ try:
228
+ update_title_para(
229
+ ts_doc, old_v, new_v, old_date_str, pub_ym, rev, author, tc_date
230
+ )
231
+ print(f' [Title] V{old_v} -> V{new_v}, ({old_date_str}) -> ({pub_ym})')
232
+ except Exception as e:
233
+ errors.append(f'Title update ERROR: {e}')
234
+ print(f' [Title] ERROR — {e}')
235
+ else:
236
+ print(f' [Title] SKIP — no (YYYY-MM) pattern in: {title_text!r}')
237
+
238
+ ts_doc.save(str(ts_final))
239
+ print(f' Saved: {spec_compact}/{ts_final.name}')
240
+ print(f' Log: {spec_compact}/{log_path.name}')
241
+ ts_final_or_applied = ts_final
242
+
243
+ except Exception as e:
244
+ errors.append(f'Finalisation ERROR: {e}')
245
+ print(f' Finalisation ERROR: {e}')
246
+
247
+ log_path.write_text(log_buf.getvalue(), encoding='utf-8')
248
+ return (ts_key, n_ok, n_skip, len(uids), ts_final_or_applied, log_path, errors)
249
+
250
+
251
  # ── Shared Steps 2, 4, 5, 6 ──────────────────────────────────────────────────
252
 
253
  def _run_steps_2_to_6(cr_list, ts_groups, output_dir, cr_dir, ts_dir,
 
349
  report = [] # (ts_key, n_ok, n_skip, n_crs, out_path, log_path, errors)
350
 
351
  for (spec_number, version), uids in ts_groups.items():
 
352
  spec_compact = spec_number.replace(' ', '')
353
  spec_dir = spec_dirs.get((spec_number, version), ts_dir / spec_compact)
354
  spec_dir.mkdir(parents=True, exist_ok=True)
355
+ report.append(_apply_ts_group(
356
+ spec_number, version, uids, ts_paths, cr_paths, spec_dir, author, tc_date
357
+ ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
 
359
  return report, cr_paths, ts_paths, spec_dirs
360
 
 
485
  report = []
486
 
487
  for (spec_number, version), uids in ts_groups.items():
 
488
  spec_compact = spec_number.replace(' ', '')
489
  spec_dir = spec_dirs.get((spec_number, version), ts_dir / spec_compact)
490
  spec_dir.mkdir(parents=True, exist_ok=True)
491
+ report.append(_apply_ts_group(
492
+ spec_number, version, uids, ts_paths, cr_paths, spec_dir, author, tc_date,
493
+ log_label='Pipeline Log (retry)'
494
+ ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
 
496
  # Update failed_ts.json — remove entries that are now resolved
497
  still_failed = [
 
505
  n_partial = sum(1 for r in report if r[4] is not None and r[6])
506
  n_failed = sum(1 for r in report if r[4] is None)
507
  print(f'TSs processed: {n_success} fully OK, {n_partial} with warnings, {n_failed} skipped/failed')
508
+ _print_report(report, detailed=False)
 
 
 
 
509
  return
510
 
511
  # ── TS mode β€” load HF index, skip Steps 1 & 3 ────────────────────────────
 
556
  # Copy the CRs actually applied into the run output dir so the ZIP
557
  # contains exactly the CRs used for this TS (only needed when using
558
  # a shared CR cache that lives outside output_dir).
559
+ _copy_cr_cache_if_needed(cr_paths, cr_dir, output_dir)
 
 
 
 
 
560
 
561
  _section('Final Report (TS mode)')
562
  n_success = sum(1 for r in report if r[4] is not None and not r[6])
 
568
  print(f'TSs updated: {n_success} fully OK, {n_partial} with warnings, {n_failed} failed')
569
  print()
570
 
571
+ _print_report(report)
 
 
 
 
 
 
 
 
 
572
 
573
  print()
574
  print(f'Output directory: {output_dir}/')
 
602
  # Copy the CRs actually applied into the run output dir so the ZIP
603
  # contains exactly the CRs used for this run (only needed when using
604
  # a shared CR cache that lives outside output_dir).
605
+ _copy_cr_cache_if_needed(cr_paths, cr_dir, output_dir)
 
 
 
 
 
606
 
607
  # ── Final Report ──────────────────────────────────────────────────────────
608
  _section('Final Report')
 
616
  print(f'TSs updated: {n_success} fully OK, {n_partial} with warnings, {n_failed} failed')
617
  print()
618
 
619
+ _print_report(report)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620
 
621
  print()
622
  print(f'Output directory: {output_dir}/')
scripts/ts_applicator.py CHANGED
@@ -21,6 +21,10 @@ from docx.oxml import OxmlElement
21
  from docx.oxml.ns import qn
22
 
23
  sys.path.insert(0, str(Path(__file__).parent))
 
 
 
 
24
  from docx_helpers import (
25
  RevCounter,
26
  tracked_modify_para,
@@ -32,24 +36,29 @@ from docx_helpers import (
32
 
33
  # ── Text normalisation ────────────────────────────────────────────────────────
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def _norm(text):
36
  """Normalise common Unicode invisible/whitespace/punctuation variants for comparison."""
37
- return (text
38
- .replace('\xa0', ' ') # non-breaking space
39
- .replace('\u202f', ' ') # narrow no-break space
40
- .replace('\u2007', ' ') # figure space
41
- .replace('\u2060', '') # word joiner (invisible)
42
- .replace('\u200b', '') # zero-width space
43
- .replace('\u00ad', '') # soft hyphen (invisible)
44
- .replace('\u2011', '-') # non-breaking hyphen
45
- .replace('\u2013', '-') # en dash
46
- .replace('\u2014', '-') # em dash
47
- .replace('\u2212', '-') # minus sign
48
- .replace('\u2018', "'") # left single quote
49
- .replace('\u2019', "'") # right single quote
50
- .replace('\u201c', '"') # left double quote
51
- .replace('\u201d', '"') # right double quote
52
- .strip())
53
 
54
 
55
  def _norm_ws(text):
@@ -70,22 +79,9 @@ def _norm_ws(text):
70
  Removing all whitespace from both sides before comparing solves this.
71
  Used as a third-level fallback (confidence 0.8) after exact and NBSP-norm.
72
  """
73
- base = (text
74
- .replace('\xa0', '')
75
- .replace('\u202f', '')
76
- .replace('\u2007', '')
77
- .replace('\u2060', '')
78
- .replace('\u200b', '')
79
- .replace('\u00ad', '')
80
- .replace('\u2011', '-')
81
- .replace('\u2013', '-')
82
- .replace('\u2014', '-')
83
- .replace('\u2212', '-')
84
- .replace('\u2018', "'")
85
- .replace('\u2019', "'")
86
- .replace('\u201c', '"')
87
- .replace('\u201d', '"'))
88
- return re.sub(r'\s+', '', base)
89
 
90
 
91
  def _norm_alnum(text):
@@ -225,14 +221,14 @@ def _find_table(doc, header_key):
225
  for tbl in doc.tables:
226
  if not tbl.rows:
227
  continue
228
- first_row_texts = [_norm(c.text) for c in tbl.rows[0].cells]
229
- # Match by prefix (header_key may have fewer columns)
230
- match = all(
231
- i < len(first_row_texts) and norm_key[i] in first_row_texts[i]
232
- for i in range(len(norm_key))
233
- )
234
- if match:
235
- return tbl, 1.0
236
 
237
  return None, 0.0
238
 
@@ -453,6 +449,20 @@ def _apply_section_replace(doc, change, rev, author, date, log):
453
  break
454
  else:
455
  break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
456
 
457
  # ── Clone and remap IDs on the CR elements ─────────────────────────────────
458
  cloned = []
@@ -522,7 +532,10 @@ def _apply_text_replace(doc, change, rev, author, date, log):
522
  for para in cell.paragraphs:
523
  if old in para.text:
524
  tracked_modify_para(para, old, new, rev, author, date)
525
- log.append(f" OK text_replace (table_cell row={row_idx} col={col_idx}): {old!r} → {new!r}")
 
 
 
526
  return True
527
  log.append(f" ERROR text_replace: old text {old!r} not in cell (row={row_idx} col={col_idx})")
528
  return False
@@ -546,7 +559,11 @@ def _apply_text_replace(doc, change, rev, author, date, log):
546
  tracked_modify_para(para, old, new, rev, author, date)
547
  log.append(f" OK text_replace (table_cell scan row={r_idx} col={col_idx}): {old!r} → {new!r}")
548
  return True
549
- # Final fallback: scan ALL columns of ALL tables
 
 
 
 
550
  _all_start = tbl_by_section if tbl_by_section is not None else tbl
551
  for search_tbl in [_all_start] + [t for t in doc.tables if t is not _all_start]:
552
  for r_idx, row in enumerate(search_tbl.rows):
@@ -554,7 +571,9 @@ def _apply_text_replace(doc, change, rev, author, date, log):
554
  for para in cell.paragraphs:
555
  if old in para.text:
556
  tracked_modify_para(para, old, new, rev, author, date)
557
- log.append(f" OK text_replace (table_cell any_col row={r_idx} col={c_idx}): {old!r} → {new!r}")
 
 
558
  return True
559
  log.append(f" ERROR text_replace: old text {old!r} not found in any table column")
560
  return False
@@ -606,6 +625,7 @@ def _apply_row_insert(doc, change, rev, author, date, log, last_inserted=None):
606
  tbl_by_section, _ = _find_table_by_section(doc, section_heading)
607
  if tbl_by_section is not None:
608
  tbl = tbl_by_section
 
609
  else:
610
  tbl, t_conf = _find_table(doc, loc['table_header'])
611
  if tbl is None:
@@ -636,7 +656,9 @@ def _apply_row_insert(doc, change, rev, author, date, log, last_inserted=None):
636
  last_inserted[key] = new_tr
637
 
638
  desc = cells_data[1]['text'] if len(cells_data) > 1 else '?'
639
- log.append(f" OK row_insert after row[{row_idx}] ({after_anchor!r}): {desc!r}")
 
 
640
  return True
641
 
642
 
 
21
  from docx.oxml.ns import qn
22
 
23
  sys.path.insert(0, str(Path(__file__).parent))
24
+
25
+ _MIN_LEN_ALLCOL_FALLBACK = 8 # old text shorter than this is too ambiguous for any-column search
26
+ _WARN_CONF = 0.8 # confidence below this emits WARN instead of OK
27
+
28
  from docx_helpers import (
29
  RevCounter,
30
  tracked_modify_para,
 
36
 
37
  # ── Text normalisation ────────────────────────────────────────────────────────
38
 
39
+ _UNICODE_REPLACEMENTS = (
40
+ ('\xa0', ' '), # non-breaking space
41
+ ('\u202f', ' '), # narrow no-break space
42
+ ('\u2007', ' '), # figure space
43
+ ('\u2060', ''), # word joiner (invisible)
44
+ ('\u200b', ''), # zero-width space
45
+ ('\u00ad', ''), # soft hyphen (invisible)
46
+ ('\u2011', '-'), # non-breaking hyphen
47
+ ('\u2013', '-'), # en dash
48
+ ('\u2014', '-'), # em dash
49
+ ('\u2212', '-'), # minus sign
50
+ ('\u2018', "'"), # left single quote
51
+ ('\u2019', "'"), # right single quote
52
+ ('\u201c', '"'), # left double quote
53
+ ('\u201d', '"'), # right double quote
54
+ )
55
+
56
+
57
  def _norm(text):
58
  """Normalise common Unicode invisible/whitespace/punctuation variants for comparison."""
59
+ for old, new in _UNICODE_REPLACEMENTS:
60
+ text = text.replace(old, new)
61
+ return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
 
64
  def _norm_ws(text):
 
79
  Removing all whitespace from both sides before comparing solves this.
80
  Used as a third-level fallback (confidence 0.8) after exact and NBSP-norm.
81
  """
82
+ for old, new in _UNICODE_REPLACEMENTS:
83
+ text = text.replace(old, new)
84
+ return re.sub(r'\s+', '', text)
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
 
87
  def _norm_alnum(text):
 
221
  for tbl in doc.tables:
222
  if not tbl.rows:
223
  continue
224
+ for row in tbl.rows[:3]: # check first 3 rows — header may not be row 0
225
+ row_texts = [_norm(c.text) for c in row.cells]
226
+ match = all(
227
+ i < len(row_texts) and norm_key[i] in row_texts[i]
228
+ for i in range(len(norm_key))
229
+ )
230
+ if match:
231
+ return tbl, 1.0
232
 
233
  return None, 0.0
234
 
 
449
  break
450
  else:
451
  break
452
+ # Validate the candidate table matches what the CR says should be deleted
453
+ if ts_tbl_elem is not None and elements_xml:
454
+ cr_tbl_xmls = [x for x in elements_xml if '<w:tbl' in x]
455
+ if cr_tbl_xmls:
456
+ from lxml import etree as _etree
457
+ cr_tbl_el = _etree.fromstring(cr_tbl_xmls[0].encode())
458
+ cr_hdr = ''.join(t.text or '' for t in
459
+ cr_tbl_el.findall('.//' + qn('w:t'))[:10]).lower()
460
+ ts_hdr = ''.join(t.text or '' for t in
461
+ ts_tbl_elem.findall('.//' + qn('w:t'))[:10]).lower()
462
+ if cr_hdr and cr_hdr not in ts_hdr and ts_hdr not in cr_hdr:
463
+ log.append(' WARN section_replace: candidate table header mismatch'
464
+ ' — skipping table removal')
465
+ ts_tbl_elem = None
466
 
467
  # ── Clone and remap IDs on the CR elements ─────────────────────────────────
468
  cloned = []
 
532
  for para in cell.paragraphs:
533
  if old in para.text:
534
  tracked_modify_para(para, old, new, rev, author, date)
535
+ _pfx = 'WARN' if min(t_conf, r_conf) < _WARN_CONF else 'OK '
536
+ log.append(f" {_pfx} text_replace (table_cell"
537
+ f" t_conf={t_conf:.1f} r_conf={r_conf:.1f}"
538
+ f" row={row_idx} col={col_idx}): {old!r} → {new!r}")
539
  return True
540
  log.append(f" ERROR text_replace: old text {old!r} not in cell (row={row_idx} col={col_idx})")
541
  return False
 
559
  tracked_modify_para(para, old, new, rev, author, date)
560
  log.append(f" OK text_replace (table_cell scan row={r_idx} col={col_idx}): {old!r} → {new!r}")
561
  return True
562
+ # Final fallback: scan ALL columns of ALL tables (guarded by min length)
563
+ if len(old) < _MIN_LEN_ALLCOL_FALLBACK:
564
+ log.append(f" ERROR text_replace: {old!r} too short for all-column fallback"
565
+ f" (ambiguous — skipped)")
566
+ return False
567
  _all_start = tbl_by_section if tbl_by_section is not None else tbl
568
  for search_tbl in [_all_start] + [t for t in doc.tables if t is not _all_start]:
569
  for r_idx, row in enumerate(search_tbl.rows):
 
571
  for para in cell.paragraphs:
572
  if old in para.text:
573
  tracked_modify_para(para, old, new, rev, author, date)
574
+ log.append(f" WARN text_replace (table_cell any_col"
575
+ f" row={r_idx} col={c_idx} — low confidence):"
576
+ f" {old!r} → {new!r}")
577
  return True
578
  log.append(f" ERROR text_replace: old text {old!r} not found in any table column")
579
  return False
 
625
  tbl_by_section, _ = _find_table_by_section(doc, section_heading)
626
  if tbl_by_section is not None:
627
  tbl = tbl_by_section
628
+ t_conf = 1.0
629
  else:
630
  tbl, t_conf = _find_table(doc, loc['table_header'])
631
  if tbl is None:
 
656
  last_inserted[key] = new_tr
657
 
658
  desc = cells_data[1]['text'] if len(cells_data) > 1 else '?'
659
+ _pfx = 'WARN' if min(t_conf, r_conf) < _WARN_CONF else 'OK '
660
+ log.append(f" {_pfx} row_insert (t_conf={t_conf:.1f} r_conf={r_conf:.1f})"
661
+ f" after row[{row_idx}] ({after_anchor!r}): {desc!r}")
662
  return True
663
 
664