Nexari-Research committed
Commit 81473e3 · verified · 1 Parent(s): 49afcdd

Update tools_engine.py

Files changed (1)
  1. tools_engine.py +36 -93
tools_engine.py CHANGED
@@ -1,23 +1,14 @@
 """
-tools_engine.py - Structured web search + page extraction + canonical intent detection
-- Uses duckduckgo_search (DDGS) to get URLs
-- Fetches pages (requests + BeautifulSoup) to extract readable snippets
-- Returns: {"query": "...", "results": [{"title","snippet","url"}, ...]}
+tools_engine.py - Improved perform_web_search to return structured results with URLs and snippets,
+and canonical intent detection unchanged.
 """

 from duckduckgo_search import DDGS
 from transformers import pipeline
-import requests
-from bs4 import BeautifulSoup
 import re
-import time

 print(">>> Tools: Loading Intent Classification Model...")
-try:
-    intent_classifier = pipeline("zero-shot-classification", model="typeform/distilbert-base-uncased-mnli")
-except Exception as e:
-    print(f"Warning: intent classifier failed to load: {e}")
-    intent_classifier = None
+intent_classifier = pipeline("zero-shot-classification", model="typeform/distilbert-base-uncased-mnli")

 def analyze_intent(user_text):
     if not user_text:
@@ -25,105 +16,57 @@ def analyze_intent(user_text):
     text_lower = user_text.lower().strip()
     direct_chat_triggers = [
         "hi","hello","hey","hlo","namaste",
-        "what is your name","who are you","your name"
+        "what is your name", "who are you", "your name"
     ]
-    if text_lower in direct_chat_triggers or any(text_lower.startswith(t+" ") for t in direct_chat_triggers):
+    if text_lower in direct_chat_triggers or any(text_lower.startswith(t + " ") for t in direct_chat_triggers):
         return "general"

     candidate_labels = ["internet search","general conversation","coding request","checking time"]
     try:
-        if intent_classifier:
-            res = intent_classifier(user_text, candidate_labels)
-            top = res['labels'][0]
-            score = res['scores'][0]
-            mapping = {
-                "internet search": "internet_search",
-                "general conversation": "general",
-                "coding request": "coding_request",
-                "checking time": "checking_time"
-            }
-            if score > 0.45:
-                return mapping.get(top, "general")
+        result = intent_classifier(user_text, candidate_labels)
+        top_label = result['labels'][0]
+        confidence = result['scores'][0]
+        mapping = {
+            "internet search": "internet_search",
+            "general conversation": "general",
+            "coding request": "coding_request",
+            "checking time": "checking_time"
+        }
+        if confidence > 0.45:
+            return mapping.get(top_label, "general")
     except Exception:
         pass
     return "general"

-def fetch_snippet_from_url(url, max_chars=320, timeout=6):
+def perform_web_search(user_text, max_results=4):
     """
-    Fetch page HTML and extract readable snippet using heuristics.
+    Return structured results:
+    {
+      "query": "...",
+      "results": [
+         {"title": "...", "snippet": "...", "url": "..."},
+         ...
+      ]
+    }
     """
     try:
-        headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) NexariBot/1.0"}
-        r = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True)
-        if r.status_code != 200 or not r.text:
-            return ""
-        soup = BeautifulSoup(r.text, "html.parser")
-        for s in soup(["script", "style", "noscript", "header", "footer", "form", "nav", "aside"]):
-            s.extract()
-
-        article = soup.find("article")
-        main = soup.find("main")
-        body_text = ""
-        if article:
-            body_text = article.get_text(separator=" ", strip=True)
-        elif main:
-            body_text = main.get_text(separator=" ", strip=True)
-        else:
-            # gather longest paragraphs
-            texts = [t.get_text(" ", strip=True) for t in soup.find_all(["p","div","span"])]
-            texts = [t for t in texts if len(t) > 40]
-            texts = sorted(texts, key=len, reverse=True)
-            body_text = " ".join(texts[:3]) if texts else soup.get_text(separator=" ", strip=True)
-
-        body_text = re.sub(r'\s+', ' ', (body_text or "")).strip()
-        if not body_text:
-            return ""
-        if len(body_text) <= max_chars:
-            return body_text
-        # try to cut at sentence boundary
-        chunk = body_text[:max_chars+60]
-        last_period = max(chunk.rfind('. '), chunk.rfind('! '), chunk.rfind('? '))
-        if last_period > int(max_chars*0.2):
-            snippet = chunk[:last_period+1]
-        else:
-            snippet = body_text[:max_chars].rsplit(' ', 1)[0] + "..."
-        return snippet
-    except Exception:
-        return ""
-
-def perform_web_search(user_text, max_results=3):
-    """
-    Return structured results.
-    """
-    try:
-        query = (user_text or "").strip()
-        if not query:
-            return {"query": "", "results": []}
-        # sanitize
-        removals = ["search for", "find", "google", "lookup", "look up", "what is", "tell me about"]
+        query = user_text
+        # sanitize small verbs
+        remove_phrases = ["search for","find","google","look up","lookup","what is","tell me"]
         q = query.lower()
-        for r in removals:
-            q = q.replace(r, "")
+        for p in remove_phrases:
+            q = q.replace(p, "")
        q = q.strip() or query

         results = list(DDGS().text(q, max_results=max_results))
         structured = {"query": q, "results": []}
-        if not results:
-            return structured
-
-        for r in results[:max_results]:
-            title = (r.get("title") or "").strip()
-            ddg_body = (r.get("body") or r.get("snippet") or "").strip()
+        for r in results:
+            title = r.get("title","").strip()
+            body = re.sub(r'\s+',' ', r.get("body","").strip())
             url = r.get("href") or r.get("url") or r.get("link") or ""
-            snippet = ddg_body
-            if (not snippet or len(snippet) < 80) and url:
-                fetched = fetch_snippet_from_url(url, max_chars=320)
-                if fetched:
-                    snippet = fetched
-            # fallback truncate
-            snippet = re.sub(r'\s+', ' ', (snippet or ""))[:320].strip()
-            structured["results"].append({"title": title or url, "snippet": snippet, "url": url})
-            time.sleep(0.18) # polite delay
+            # short snippet
+            snippet = body[:320]
+            structured["results"].append({"title": title, "snippet": snippet, "url": url})
         return structured
     except Exception as e:
         print(f"Search error: {e}")
 
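For reference, here is a minimal usage sketch of the new `perform_web_search`. The import path and the example query are illustrative, and it assumes the `duckduckgo-search` and `transformers` packages are installed:

```python
# Illustrative only: assumes this file is importable as `tools_engine`.
from tools_engine import perform_web_search

res = perform_web_search("search for python 3.13 release notes", max_results=2)
# The except branch shown in the hunk only prints the error, so it is
# prudent to guard against a possible None return.
if res:
    print(res["query"])  # sanitized query: "python 3.13 release notes"
    for item in res["results"]:
        print(f"- {item['title']} ({item['url']})")
        print(f"  {item['snippet']}")
```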
 
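One caveat about the `# sanitize small verbs` loop: `str.replace` deletes the phrases wherever they occur, not just as standalone words, so a query like "finding nemo" would lose its "find". A word-boundary regex is a common fix; the following is a sketch under that assumption, with a hypothetical helper name not present in the diff:

```python
import re

# Hypothetical helper: strips lead-in phrases only when they appear as
# whole words, unlike the plain str.replace() used in the diff.
REMOVE_PHRASES = ["search for", "find", "google", "look up", "lookup", "what is", "tell me"]
_PATTERN = re.compile(r"\b(?:" + "|".join(re.escape(p) for p in REMOVE_PHRASES) + r")\b")

def sanitize_query(query: str) -> str:
    q = _PATTERN.sub("", query.lower())
    return re.sub(r"\s+", " ", q).strip() or query

print(sanitize_query("finding nemo"))          # -> "finding nemo" (left intact)
print(sanitize_query("search for rust jobs"))  # -> "rust jobs"
```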
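Finally, a sketch of the zero-shot call that `analyze_intent` builds on. The model name and candidate labels come from the diff; the example input and printed values are illustrative:

```python
from transformers import pipeline

# Same pipeline the module constructs at import time.
clf = pipeline("zero-shot-classification",
               model="typeform/distilbert-base-uncased-mnli")

labels = ["internet search", "general conversation", "coding request", "checking time"]
out = clf("what time is it in Tokyo?", labels)

# out["labels"] is sorted best-first and aligned with out["scores"];
# analyze_intent() keeps the top label only when its score exceeds 0.45.
print(out["labels"][0], round(out["scores"][0], 3))
```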