srilakshu012456 committed on
Commit
16cbfa8
·
verified ·
1 Parent(s): ea5f821

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +53 -4
main.py CHANGED
@@ -336,21 +336,63 @@ def _build_doc_section_index(best_doc: str) -> Dict[str, Optional[str]]:
336
 
337
  def _cut_at_next_boundary_generic(section_text: str, best_doc: str, current_action: Optional[str]) -> str:
338
  """
339
- Stop when we hit any known section heading in the same doc.
340
- If current_action is set, stop only when the next heading belongs to a different action_tag.
341
- Fully generic: no hard-coded SOP words.
 
 
342
  """
343
  if not (section_text or "").strip():
344
  return section_text
345
 
346
- index = _build_doc_section_index(best_doc)
 
 
 
 
 
 
 
 
 
347
  known_headings = set(index.keys())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
  lines = [ln for ln in (section_text or "").splitlines() if ln.strip()]
349
  out: List[str] = []
350
 
351
  for ln in lines:
352
  low = ln.lower().strip()
353
 
 
354
  matched_heading = None
355
  for h in known_headings:
356
  if h in low:
@@ -362,6 +404,13 @@ def _cut_at_next_boundary_generic(section_text: str, best_doc: str, current_acti
362
  if (current_action and next_action and next_action != current_action) or (current_action is None):
363
  break
364
 
 
 
 
 
 
 
 
365
  out.append(ln)
366
 
367
  return "\n".join(out).strip()
 
336
 
337
  def _cut_at_next_boundary_generic(section_text: str, best_doc: str, current_action: Optional[str]) -> str:
338
  """
339
+ Stop when we hit:
340
+ 1) Any known section heading (from KB metadata) that belongs to a *different* action_tag, OR
341
+ 2) Any heading-like OR numbered line that contains generic action keywords for a *different* action family.
342
+
343
+ Generic & future-proof: no SOP-specific terms.
344
  """
345
  if not (section_text or "").strip():
346
  return section_text
347
 
348
+ # Build metadata-based index: {lower(section_title): lower(action_tag or None)}
349
+ index: Dict[str, Optional[str]] = {}
350
+ for d in bm25_docs:
351
+ m = d.get("meta", {}) or {}
352
+ if m.get("filename") == best_doc and m.get("intent_tag") == "steps":
353
+ sec = (m.get("section") or "").strip()
354
+ tag = (m.get("action_tag") or "").strip().lower() or None
355
+ if sec:
356
+ index[sec.lower()] = tag
357
+
358
  known_headings = set(index.keys())
359
+
360
+ # Generic action families (no SOP-specific words)
361
+ ACTION_FAMILIES = {
362
+ "create": ("create", "creation", "new"),
363
+ "update": ("update", "updation", "reschedule", "edit", "modify", "change"),
364
+ "delete": ("delete", "deletion", "cancel", "remove", "void"),
365
+ }
366
+
367
+ STEP_PREFIX_RX = re.compile(r"^\s*(?:[\u2460-\u2473]|\d+\s*[.)]|[-*•])")
368
+
369
+ def detect_action_family_in_line(line_low: str) -> Optional[str]:
370
+ for fam, toks in ACTION_FAMILIES.items():
371
+ for t in toks:
372
+ if re.search(rf"\b{re.escape(t)}\b", line_low, flags=re.IGNORECASE):
373
+ return fam
374
+ return None
375
+
376
+ def is_heading_like(raw_line: str, line_low: str) -> bool:
377
+ if (":" in raw_line) or ("–" in raw_line) or re.match(r"^\s*[-–]\s*", raw_line):
378
+ return True
379
+ if any(h in line_low for h in known_headings):
380
+ return True
381
+ # Simple title-style heuristic
382
+ if len(raw_line.strip()) <= 140:
383
+ words = re.findall(r"[A-Za-z][A-Za-z]+", raw_line)
384
+ cap_ratio = sum(1 for w in words if (w[0].isupper() or w.isupper())) / (len(words) or 1)
385
+ if cap_ratio >= 0.40:
386
+ return True
387
+ return False
388
+
389
  lines = [ln for ln in (section_text or "").splitlines() if ln.strip()]
390
  out: List[str] = []
391
 
392
  for ln in lines:
393
  low = ln.lower().strip()
394
 
395
+ # 1) Metadata heading boundary (best case)
396
  matched_heading = None
397
  for h in known_headings:
398
  if h in low:
 
404
  if (current_action and next_action and next_action != current_action) or (current_action is None):
405
  break
406
 
407
+ # 2) Generic action boundary (works even if visible text != metadata title)
408
+ fam = detect_action_family_in_line(low)
409
+ if current_action and fam and fam != current_action:
410
+ # treat heading-like OR numbered lines as boundaries
411
+ if is_heading_like(ln, low) or STEP_PREFIX_RX.match(ln):
412
+ break
413
+
414
  out.append(ln)
415
 
416
  return "\n".join(out).strip()