Nexari-Research committed
Commit 81473e3 · verified · 1 Parent(s): 49afcdd

Update tools_engine.py

Files changed (1)
  1. tools_engine.py +36 -93
tools_engine.py CHANGED
@@ -1,23 +1,14 @@
 """
-tools_engine.py - Structured web search + page extraction + canonical intent detection
-- Uses duckduckgo_search (DDGS) to get URLs
-- Fetches pages (requests + BeautifulSoup) to extract readable snippets
-- Returns: {"query": "...", "results": [{"title","snippet","url"}, ...]}
+tools_engine.py - Improved perform_web_search to return structured results with URLs and snippets,
+and canonical intent detection unchanged.
 """

 from duckduckgo_search import DDGS
 from transformers import pipeline
-import requests
-from bs4 import BeautifulSoup
 import re
-import time

 print(">>> Tools: Loading Intent Classification Model...")
-try:
-    intent_classifier = pipeline("zero-shot-classification", model="typeform/distilbert-base-uncased-mnli")
-except Exception as e:
-    print(f"Warning: intent classifier failed to load: {e}")
-    intent_classifier = None
+intent_classifier = pipeline("zero-shot-classification", model="typeform/distilbert-base-uncased-mnli")

 def analyze_intent(user_text):
     if not user_text:
@@ -25,105 +16,57 @@ def analyze_intent(user_text):
     text_lower = user_text.lower().strip()
     direct_chat_triggers = [
         "hi","hello","hey","hlo","namaste",
-        "what is your name","who are you","your name"
+        "what is your name", "who are you", "your name"
     ]
-    if text_lower in direct_chat_triggers or any(text_lower.startswith(t+" ") for t in direct_chat_triggers):
+    if text_lower in direct_chat_triggers or any(text_lower.startswith(t + " ") for t in direct_chat_triggers):
         return "general"

     candidate_labels = ["internet search","general conversation","coding request","checking time"]
     try:
-        if intent_classifier:
-            res = intent_classifier(user_text, candidate_labels)
-            top = res['labels'][0]
-            score = res['scores'][0]
-            mapping = {
-                "internet search": "internet_search",
-                "general conversation": "general",
-                "coding request": "coding_request",
-                "checking time": "checking_time"
-            }
-            if score > 0.45:
-                return mapping.get(top, "general")
+        result = intent_classifier(user_text, candidate_labels)
+        top_label = result['labels'][0]
+        confidence = result['scores'][0]
+        mapping = {
+            "internet search": "internet_search",
+            "general conversation": "general",
+            "coding request": "coding_request",
+            "checking time": "checking_time"
+        }
+        if confidence > 0.45:
+            return mapping.get(top_label, "general")
     except Exception:
         pass
     return "general"

-def fetch_snippet_from_url(url, max_chars=320, timeout=6):
+def perform_web_search(user_text, max_results=4):
     """
-    Fetch page HTML and extract readable snippet using heuristics.
+    Return structured results:
+    {
+      "query": "...",
+      "results": [
+         {"title": "...", "snippet": "...", "url": "..."},
+         ...
+      ]
+    }
     """
     try:
-        headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) NexariBot/1.0"}
-        r = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
-        if r.status_code != 200 or not r.text:
-            return ""
-        soup = BeautifulSoup(r.text, "html.parser")
-        for s in soup(["script", "style", "noscript", "header", "footer", "form", "nav", "aside"]):
-            s.extract()
-
-        article = soup.find("article")
-        main = soup.find("main")
-        body_text = ""
-        if article:
-            body_text = article.get_text(separator=" ", strip=True)
-        elif main:
-            body_text = main.get_text(separator=" ", strip=True)
-        else:
-            # gather longest paragraphs
-            texts = [t.get_text(" ", strip=True) for t in soup.find_all(["p","div","span"])]
-            texts = [t for t in texts if len(t) > 40]
-            texts = sorted(texts, key=len, reverse=True)
-            body_text = " ".join(texts[:3]) if texts else soup.get_text(separator=" ", strip=True)
-
-        body_text = re.sub(r'\s+', ' ', (body_text or "")).strip()
-        if not body_text:
-            return ""
-        if len(body_text) <= max_chars:
-            return body_text
-        # try to cut at sentence boundary
-        chunk = body_text[:max_chars+60]
-        last_period = max(chunk.rfind('. '), chunk.rfind('! '), chunk.rfind('? '))
-        if last_period > int(max_chars*0.2):
-            snippet = chunk[:last_period+1]
-        else:
-            snippet = body_text[:max_chars].rsplit(' ', 1)[0] + "..."
-        return snippet
-    except Exception:
-        return ""
-
-def perform_web_search(user_text, max_results=3):
-    """
-    Return structured results.
-    """
-    try:
-        query = (user_text or "").strip()
-        if not query:
-            return {"query": "", "results": []}
-        # sanitize
-        removals = ["search for", "find", "google", "lookup", "look up", "what is", "tell me about"]
+        query = user_text
+        # sanitize small verbs
+        remove_phrases = ["search for","find","google","look up","lookup","what is","tell me"]
         q = query.lower()
-        for r in removals:
-            q = q.replace(r, "")
+        for p in remove_phrases:
+            q = q.replace(p, "")
        q = q.strip() or query

         results = list(DDGS().text(q, max_results=max_results))
         structured = {"query": q, "results": []}
-        if not results:
-            return structured
-
-        for r in results[:max_results]:
-            title = (r.get("title") or "").strip()
-            ddg_body = (r.get("body") or r.get("snippet") or "").strip()
+        for r in results:
+            title = r.get("title","").strip()
+            body = re.sub(r'\s+',' ', r.get("body","").strip())
             url = r.get("href") or r.get("url") or r.get("link") or ""
-            snippet = ddg_body
-            if (not snippet or len(snippet) < 80) and url:
-                fetched = fetch_snippet_from_url(url, max_chars=320)
-                if fetched:
-                    snippet = fetched
-            # fallback truncate
-            snippet = re.sub(r'\s+', ' ', (snippet or ""))[:320].strip()
-            structured["results"].append({"title": title or url, "snippet": snippet, "url": url})
-            time.sleep(0.18) # polite delay
+            # short snippet
+            snippet = body[:320]
+            structured["results"].append({"title": title, "snippet": snippet, "url": url})
         return structured
     except Exception as e:
         print(f"Search error: {e}")
 
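For reference, here is a minimal usage sketch of the new `perform_web_search`. The import path and the example query are illustrative, and it assumes the `duckduckgo-search` and `transformers` packages are installed:

```python
# Illustrative only: assumes this file is importable as `tools_engine`.
from tools_engine import perform_web_search

res = perform_web_search("search for python 3.13 release notes", max_results=2)
# The except branch shown in the hunk only prints the error, so it is
# prudent to guard against a possible None return.
if res:
    print(res["query"])  # sanitized query: "python 3.13 release notes"
    for item in res["results"]:
        print(f"- {item['title']} ({item['url']})")
        print(f"  {item['snippet']}")
```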
 
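One caveat about the `# sanitize small verbs` loop: `str.replace` deletes the phrases wherever they occur, not just as standalone words, so a query like "finding nemo" would lose its "find". A word-boundary regex is a common fix; the following is a sketch under that assumption, with a hypothetical helper name not present in the diff:

```python
import re

# Hypothetical helper: strips lead-in phrases only when they appear as
# whole words, unlike the plain str.replace() used in the diff.
REMOVE_PHRASES = ["search for", "find", "google", "look up", "lookup", "what is", "tell me"]
_PATTERN = re.compile(r"\b(?:" + "|".join(re.escape(p) for p in REMOVE_PHRASES) + r")\b")

def sanitize_query(query: str) -> str:
    q = _PATTERN.sub("", query.lower())
    return re.sub(r"\s+", " ", q).strip() or query

print(sanitize_query("finding nemo"))          # -> "finding nemo" (left intact)
print(sanitize_query("search for rust jobs"))  # -> "rust jobs"
```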
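Finally, a sketch of the zero-shot call that `analyze_intent` builds on. The model name and candidate labels come from the diff; the example input and printed values are illustrative:

```python
from transformers import pipeline

# Same pipeline the module constructs at import time.
clf = pipeline("zero-shot-classification",
               model="typeform/distilbert-base-uncased-mnli")

labels = ["internet search", "general conversation", "coding request", "checking time"]
out = clf("what time is it in Tokyo?", labels)

# out["labels"] is sorted best-first and aligned with out["scores"];
# analyze_intent() keeps the top label only when its score exceeds 0.45.
print(out["labels"][0], round(out["scores"][0], 3))
```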