Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -336,21 +336,63 @@ def _build_doc_section_index(best_doc: str) -> Dict[str, Optional[str]]:
|
|
| 336 |
|
| 337 |
def _cut_at_next_boundary_generic(section_text: str, best_doc: str, current_action: Optional[str]) -> str:
|
| 338 |
"""
|
| 339 |
-
Stop when we hit
|
| 340 |
-
|
| 341 |
-
|
|
|
|
|
|
|
| 342 |
"""
|
| 343 |
if not (section_text or "").strip():
|
| 344 |
return section_text
|
| 345 |
|
| 346 |
-
index
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 347 |
known_headings = set(index.keys())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
lines = [ln for ln in (section_text or "").splitlines() if ln.strip()]
|
| 349 |
out: List[str] = []
|
| 350 |
|
| 351 |
for ln in lines:
|
| 352 |
low = ln.lower().strip()
|
| 353 |
|
|
|
|
| 354 |
matched_heading = None
|
| 355 |
for h in known_headings:
|
| 356 |
if h in low:
|
|
@@ -362,6 +404,13 @@ def _cut_at_next_boundary_generic(section_text: str, best_doc: str, current_acti
|
|
| 362 |
if (current_action and next_action and next_action != current_action) or (current_action is None):
|
| 363 |
break
|
| 364 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
out.append(ln)
|
| 366 |
|
| 367 |
return "\n".join(out).strip()
|
|
|
|
| 336 |
|
| 337 |
def _cut_at_next_boundary_generic(section_text: str, best_doc: str, current_action: Optional[str]) -> str:
|
| 338 |
"""
|
| 339 |
+
Stop when we hit:
|
| 340 |
+
1) Any known section heading (from KB metadata) that belongs to a *different* action_tag, OR
|
| 341 |
+
2) Any heading-like OR numbered line that contains generic action keywords for a *different* action family.
|
| 342 |
+
|
| 343 |
+
Generic & future-proof: no SOP-specific terms.
|
| 344 |
"""
|
| 345 |
if not (section_text or "").strip():
|
| 346 |
return section_text
|
| 347 |
|
| 348 |
+
# Build metadata-based index: {lower(section_title): lower(action_tag or None)}
|
| 349 |
+
index: Dict[str, Optional[str]] = {}
|
| 350 |
+
for d in bm25_docs:
|
| 351 |
+
m = d.get("meta", {}) or {}
|
| 352 |
+
if m.get("filename") == best_doc and m.get("intent_tag") == "steps":
|
| 353 |
+
sec = (m.get("section") or "").strip()
|
| 354 |
+
tag = (m.get("action_tag") or "").strip().lower() or None
|
| 355 |
+
if sec:
|
| 356 |
+
index[sec.lower()] = tag
|
| 357 |
+
|
| 358 |
known_headings = set(index.keys())
|
| 359 |
+
|
| 360 |
+
# Generic action families (no SOP-specific words)
|
| 361 |
+
ACTION_FAMILIES = {
|
| 362 |
+
"create": ("create", "creation", "new"),
|
| 363 |
+
"update": ("update", "updation", "reschedule", "edit", "modify", "change"),
|
| 364 |
+
"delete": ("delete", "deletion", "cancel", "remove", "void"),
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
STEP_PREFIX_RX = re.compile(r"^\s*(?:[\u2460-\u2473]|\d+\s*[.)]|[-*•])")
|
| 368 |
+
|
| 369 |
+
def detect_action_family_in_line(line_low: str) -> Optional[str]:
    """Return the first action family with a keyword present in ``line_low``.

    Families are tried in the iteration order of ``ACTION_FAMILIES``. A
    keyword counts only when it appears as a whole word (``\\b`` on both
    sides), matched case-insensitively. Returns ``None`` if no family has
    a matching keyword.
    """
    for family_name, keywords in ACTION_FAMILIES.items():
        # One alternation per family is equivalent to testing each keyword in turn.
        alternation = "|".join(re.escape(keyword) for keyword in keywords)
        if re.search(rf"\b(?:{alternation})\b", line_low, flags=re.IGNORECASE):
            return family_name
    return None
|
| 375 |
+
|
| 376 |
+
def is_heading_like(raw_line: str, line_low: str) -> bool:
    """Heuristically decide whether a line looks like a heading.

    A line is treated as heading-like when any of these hold:
      * ``raw_line`` contains a colon or an en dash, or starts (after
        optional whitespace) with a hyphen or en dash;
      * ``line_low`` contains one of the ``known_headings`` as a substring;
      * the stripped ``raw_line`` is at most 140 characters and at least 40%
        of its alphabetic words (two or more letters) start with an
        uppercase letter or are entirely uppercase.
    """
    has_separator = ":" in raw_line or "–" in raw_line
    if has_separator or re.match(r"^\s*[-–]\s*", raw_line) is not None:
        return True

    for heading in known_headings:
        if heading in line_low:
            return True

    # Title-style heuristic applies only to reasonably short lines.
    if len(raw_line.strip()) > 140:
        return False

    words = re.findall(r"[A-Za-z][A-Za-z]+", raw_line)
    capitalised = [w for w in words if w[0].isupper() or w.isupper()]
    # "or 1" avoids dividing by zero when the line has no alphabetic words.
    return len(capitalised) / (len(words) or 1) >= 0.40
|
| 388 |
+
|
| 389 |
lines = [ln for ln in (section_text or "").splitlines() if ln.strip()]
|
| 390 |
out: List[str] = []
|
| 391 |
|
| 392 |
for ln in lines:
|
| 393 |
low = ln.lower().strip()
|
| 394 |
|
| 395 |
+
# 1) Metadata heading boundary (best case)
|
| 396 |
matched_heading = None
|
| 397 |
for h in known_headings:
|
| 398 |
if h in low:
|
|
|
|
| 404 |
if (current_action and next_action and next_action != current_action) or (current_action is None):
|
| 405 |
break
|
| 406 |
|
| 407 |
+
# 2) Generic action boundary (works even if visible text != metadata title)
|
| 408 |
+
fam = detect_action_family_in_line(low)
|
| 409 |
+
if current_action and fam and fam != current_action:
|
| 410 |
+
# treat heading-like OR numbered lines as boundaries
|
| 411 |
+
if is_heading_like(ln, low) or STEP_PREFIX_RX.match(ln):
|
| 412 |
+
break
|
| 413 |
+
|
| 414 |
out.append(ln)
|
| 415 |
|
| 416 |
return "\n".join(out).strip()
|