Spaces:

ChatbotNova
/

Chatbot-Backend

Sleeping

App Files Files Community

srilakshu012456 committed on Jan 5

Commit

e293c42

verified ·

1 Parent(s): 46344cb

Update main.py

Browse files

Files changed (1) hide show

main.py +70 -76

main.py CHANGED Viewed

@@ -105,6 +105,47 @@ DOMAIN_STATUS_TERMS = (
     "asn", "grn", "pick", "picking"
 )
 def _is_domain_status_context(msg_norm: str) -> bool:
     if "status locked" in msg_norm or "locked status" in msg_norm:
         return True
@@ -161,80 +202,45 @@ def _ensure_numbering(text: str) -> str:
         out.append(f"{marker} {seg}")
     return "\n".join(out)
-def _filter_error_lines_by_query(text: str, query: str, max_lines: int = 4) -> str:
     """
-    Pick the most relevant 'Common Errors & Resolution' bullets for the user's message.
-    Generic across SOPs via error families + phrase overlap.
-    Prioritization:
       1) error-family match (NOT_FOUND/MISMATCH/LOCKED/PERMISSION/TIMEOUT/SYNC),
-      2) anchored starts (line begins with the error phrase/heading),
       3) multi-word overlap (bigrams/trigrams),
       4) token overlap,
-      5) bullet/heading formatting bonus.
-    If no line matches positively, falls back to the first few lines.
     """
     import re
     from typing import List, Tuple
-    # --- Generic error families (SOP-wide) ---
-    ERROR_FAMILIES = {
-        "NOT_FOUND": (
-            "not found", "missing", "does not exist", "doesn't exist",
-            "unavailable", "not available", "cannot find", "no such", "not present", "absent"
-        ),
-        "MISMATCH": (
-            "mismatch", "doesn't match", "does not match", "variance",
-            "difference", "discrepancy", "not equal"
-        ),
-        "LOCKED": (
-            "locked", "status locked", "blocked", "read only", "read-only", "frozen", "freeze"
-        ),
-        "PERMISSION": (
-            "permission", "permissions", "access denied", "not authorized",
-            "not authorised", "insufficient privileges", "no access", "authorization", "authorisation"
-        ),
-        "TIMEOUT": (
-            "timeout", "timed out", "network", "connection", "unable to connect",
-            "disconnected", "no network"
-        ),
-        "SYNC": (
-            "sync", "synchronization", "synchronisation", "replication",
-            "refresh", "out of sync", "stale", "delay", "lag"
-        ),
-    }
-    # Normalizer
     def _norm(s: str) -> str:
         s = (s or "").lower()
         s = re.sub(r"[^\w\s]", " ", s)
         s = re.sub(r"\s+", " ", s).strip()
         return s
-    # Detect error families mentioned in a string
-    def _families_for(s: str) -> List[str]:
-        out = []
-        low = _norm(s)
-        for fam, syns in ERROR_FAMILIES.items():
-            if any(k in low for k in syns):
-                out.append(fam)
-        return out
-    # N-grams
     def _ngrams(tokens: List[str], n: int) -> List[str]:
         return [" ".join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
-    # Normalize query
     q = _norm(query)
     q_tokens = [t for t in q.split() if len(t) > 1]
     q_bi = _ngrams(q_tokens, 2)
     q_tri = _ngrams(q_tokens, 3)
-    q_families = set(_families_for(query))
-    # Candidate lines
     lines = _normalize_lines(text)
     if not lines:
         return (text or "").strip()
@@ -242,53 +248,41 @@ def _filter_error_lines_by_query(text: str, query: str, max_lines: int = 4) -> s
     scored: List[Tuple[float, str]] = []
     for ln in lines:
         ln_norm = _norm(ln)
-        ln_families = set(_families_for(ln))
-        # --- Signals ---
-        # Family match (strong): any overlap between query families and line families
-        fam_overlap = len(q_families & ln_families)
-        fam_score = 1.60 * fam_overlap  # strong boost when families line up
-        # Exact phrase (medium-strong)
-        exact_phrase = 1.00 if (q and q in ln_norm) else 0.0
-        # Anchored start (strong for bullet headings like "ASN not found: ...")
         first2 = " ".join(q_tokens[:2]) if len(q_tokens) >= 2 else ""
         first3 = " ".join(q_tokens[:3]) if len(q_tokens) >= 3 else ""
-        anchored = 1.00 if (first3 and ln_norm.startswith(first3)) or (first2 and ln_norm.startswith(first2)) else 0.0
-        # Multi-word phrase overlap
         bigram_hits = sum(1 for bg in q_bi if bg and bg in ln_norm)
         trigram_hits = sum(1 for tg in q_tri if tg and tg in ln_norm)
-        # Token overlap (fallback)
         token_overlap = sum(1 for t in q_tokens if t and t in ln_norm)
-        # --- Score composition (tuned for generic SOPs) ---
         score = (
-            fam_score +
-            0.90 * anchored +
             0.80 * trigram_hits +
             0.55 * bigram_hits +
-            0.45 * exact_phrase +
             0.30 * token_overlap
         )
-        # Small bonuses for bullets/heading-like lines
-        if re.match(r"^\s*[\-\*\u2022]\s*", ln):  # bullet dot
             score += 0.10
-        # Heading before ':' matches some part of the query
         heading = ln_norm.split(":")[0].strip()
         if heading and (heading in q or (first2 and first2 in heading)):
             score += 0.15
         scored.append((score, ln))
-    # Sort by score desc and take top max_lines
     scored.sort(key=lambda x: x[0], reverse=True)
     top = [ln for s, ln in scored[:max_lines] if s > 0.0]
-    # Fallback if everything scored zero
     if not top:
         top = lines[:max_lines]
@@ -863,8 +857,8 @@ async def chat_with_ai(input_data: ChatInput):
         # Bypass gate when strong steps signals are present for Receiving module
         strong_steps_bypass = looks_like_steps_query and looks_like_receiving
-        if (weak_domain_only or (low_context_hit and not combined_ok)) and not strong_steps_bypass:
             return {
                 "bot_response": _build_clarifying_message(),
                 "status": "NO_KB_MATCH",
@@ -909,7 +903,7 @@ async def chat_with_ai(input_data: ChatInput):
                     if is_perm_query:
                         context = _filter_permission_lines(ctx_err, max_lines=6)
                     else:
-                        context = _filter_error_lines_by_query(ctx_err, input_data.user_message, max_lines=6)
                     escalation_line = _extract_escalation_line(full_errors)
             elif detected_intent == "prereqs":

     "asn", "grn", "pick", "picking"
 )
+# --- Generic error families (SOP-wide, reusable in gating and line selection) ---
+ERROR_FAMILY_SYNS = {
+    "NOT_FOUND": (
+        "not found", "missing", "does not exist", "doesn't exist",
+        "unavailable", "not available", "cannot find", "no such",
+        "not present", "absent"
+    ),
+    "MISMATCH": (
+        "mismatch", "doesn't match", "does not match", "variance",
+        "difference", "discrepancy", "not equal"
+    ),
+    "LOCKED": (
+        "locked", "status locked", "blocked", "read only", "read-only", "frozen", "freeze"
+    ),
+    "PERMISSION": (
+        "permission", "permissions", "access denied", "not authorized",
+        "not authorised", "insufficient privileges", "no access",
+        "authorization", "authorisation"
+    ),
+    "TIMEOUT": (
+        "timeout", "timed out", "network", "connection",
+        "unable to connect", "disconnected", "no network"
+    ),
+    "SYNC": (
+        "sync", "synchronization", "synchronisation", "replication",
+        "refresh", "out of sync", "stale", "delay", "lag"
+    ),
+}
+def _detect_error_families(msg: str) -> list:
+    """Return matching error family names found in the message (generic across SOPs)."""
+    low = (msg or "").lower()
+    import re
+    low_norm = re.sub(r"[^\w\s]", " ", low)
+    low_norm = re.sub(r"\s+", " ", low_norm).strip()
+    fams = []
+    for fam, syns in ERROR_FAMILY_SYNS.items():
+        if any(s in low_norm for s in syns):
+            fams.append(fam)
+    return fams
 def _is_domain_status_context(msg_norm: str) -> bool:
     if "status locked" in msg_norm or "locked status" in msg_norm:
         return True
         out.append(f"{marker} {seg}")
     return "\n".join(out)
+def _filter_error_lines_by_query(text: str, query: str, max_lines: int = 1) -> str:
     """
+    Pick the most relevant 'Common Errors & Resolution' bullet(s) for the user's message.
+    Generic (SOP-agnostic) scoring:
       1) error-family match (NOT_FOUND/MISMATCH/LOCKED/PERMISSION/TIMEOUT/SYNC),
+      2) anchored starts (line begins with error heading),
       3) multi-word overlap (bigrams/trigrams),
       4) token overlap,
+      5) formatting bonus for bullets/headings.
+    Returns exactly `max_lines` best-scoring lines (defaults to 1).
     """
     import re
     from typing import List, Tuple
     def _norm(s: str) -> str:
         s = (s or "").lower()
         s = re.sub(r"[^\w\s]", " ", s)
         s = re.sub(r"\s+", " ", s).strip()
         return s
     def _ngrams(tokens: List[str], n: int) -> List[str]:
         return [" ".join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
+    def _families_for(s: str) -> set:
+        low = _norm(s)
+        fams = set()
+        for fam, syns in ERROR_FAMILY_SYNS.items():
+            if any(k in low for k in syns):
+                fams.add(fam)
+        return fams
     q = _norm(query)
     q_tokens = [t for t in q.split() if len(t) > 1]
     q_bi = _ngrams(q_tokens, 2)
     q_tri = _ngrams(q_tokens, 3)
+    q_fams = _families_for(query)
     lines = _normalize_lines(text)
     if not lines:
         return (text or "").strip()
     scored: List[Tuple[float, str]] = []
     for ln in lines:
         ln_norm = _norm(ln)
+        ln_fams = _families_for(ln)
+        fam_overlap = len(q_fams & ln_fams)         # strong signal
+        anchored = 0.0
         first2 = " ".join(q_tokens[:2]) if len(q_tokens) >= 2 else ""
         first3 = " ".join(q_tokens[:3]) if len(q_tokens) >= 3 else ""
+        if (first3 and ln_norm.startswith(first3)) or (first2 and ln_norm.startswith(first2)):
+            anchored = 1.0
         bigram_hits = sum(1 for bg in q_bi if bg and bg in ln_norm)
         trigram_hits = sum(1 for tg in q_tri if tg and tg in ln_norm)
         token_overlap = sum(1 for t in q_tokens if t and t in ln_norm)
+        exact_phrase = 1.0 if (q and q in ln_norm) else 0.0
+        # Composite score (tuned generically)
         score = (
+            1.70 * fam_overlap +
+            1.00 * anchored +
             0.80 * trigram_hits +
             0.55 * bigram_hits +
+            0.40 * exact_phrase +
             0.30 * token_overlap
         )
+        if re.match(r"^\s*[\-\*\u2022]\s*", ln):  # bullet
             score += 0.10
         heading = ln_norm.split(":")[0].strip()
         if heading and (heading in q or (first2 and first2 in heading)):
             score += 0.15
         scored.append((score, ln))
     scored.sort(key=lambda x: x[0], reverse=True)
     top = [ln for s, ln in scored[:max_lines] if s > 0.0]
     if not top:
         top = lines[:max_lines]
         # Bypass gate when strong steps signals are present for Receiving module
         strong_steps_bypass = looks_like_steps_query and looks_like_receiving
+        strong_error_signal = len(_detect_error_families(msg_low)) > 0
+        if (weak_domain_only or (low_context_hit and not combined_ok)) and not strong_steps_bypass and not strong_error_signal:
             return {
                 "bot_response": _build_clarifying_message(),
                 "status": "NO_KB_MATCH",
                     if is_perm_query:
                         context = _filter_permission_lines(ctx_err, max_lines=6)
                     else:
+                        context = _filter_error_lines_by_query(ctx_err, input_data.user_message, max_lines=1)
                     escalation_line = _extract_escalation_line(full_errors)
             elif detected_intent == "prereqs":