Musombi committed on
Commit
d47a4be
·
1 Parent(s): 5b92be0

Update reasoning/scraper.py

Browse files
Files changed (1) hide show
  1. reasoning/scraper.py +28 -4
reasoning/scraper.py CHANGED
@@ -8,11 +8,30 @@ HEADERS = {
8
  "User-Agent": "Mozilla/5.0 (MVI-AI Knowledge Engine)"
9
  }
10
 
11
- # SAFE SOURCE ONLY
12
  SOURCES = {
13
  "wikipedia": "https://en.wikipedia.org/wiki/{query}"
14
  }
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  # -------------------------
18
  # CLEANING FUNCTIONS
@@ -33,6 +52,7 @@ def is_blocked(text: str) -> bool:
33
  "access denied",
34
  "enable javascript"
35
  ]
 
36
  text_lower = text.lower()
37
  return any(b in text_lower for b in blockers)
38
 
@@ -67,7 +87,6 @@ def scrape_page(url: str) -> str:
67
 
68
  soup = BeautifulSoup(r.text, "html.parser")
69
 
70
- # Remove junk tags
71
  for tag in soup(["script", "style", "noscript"]):
72
  tag.decompose()
73
 
@@ -89,9 +108,14 @@ def scrape_page(url: str) -> str:
89
  def scrape_knowledge(query: str, limit: int = 20) -> List[Dict]:
90
  knowledge = []
91
 
 
 
 
 
 
92
  for source_name, url in SOURCES.items():
93
  try:
94
- formatted_query = query.strip().replace(" ", "_")
95
  full_url = url.format(query=formatted_query)
96
 
97
  page_text = scrape_page(full_url)
@@ -113,4 +137,4 @@ def scrape_knowledge(query: str, limit: int = 20) -> List[Dict]:
113
  except Exception:
114
  continue
115
 
116
- return knowledge
 
8
  "User-Agent": "Mozilla/5.0 (MVI-AI Knowledge Engine)"
9
  }
10
 
 
11
  SOURCES = {
12
  "wikipedia": "https://en.wikipedia.org/wiki/{query}"
13
  }
14
 
15
+ # -------------------------
16
+ # QUERY NORMALIZATION
17
+ # -------------------------
18
+
19
def normalize_query(query: str) -> str:
    """Reduce a natural-language question to a bare topic string.

    Lowercases the query, strips leading question phrases ("what is",
    "define", ...), and removes punctuation so the result can be used
    as a URL path segment (e.g. a Wikipedia article title).

    Args:
        query: Raw user question, e.g. "What is Python?".

    Returns:
        The normalized topic, e.g. "python". May be empty if the query
        contained nothing but question phrases/punctuation.
    """
    query = query.lower().strip()

    stop_phrases = [
        "what is", "who is", "define",
        "explain", "tell me about",
        "what are", "how does"
    ]

    # Strip question phrases only when they appear as a word-bounded
    # PREFIX of the query. Using str.replace here would mangle words
    # that merely contain a phrase ("definition" -> "ition").
    # Loop so stacked phrases ("tell me about what is x") all come off.
    stripped = True
    while stripped:
        stripped = False
        for phrase in stop_phrases:
            if query.startswith(phrase) and (
                len(query) == len(phrase) or query[len(phrase)].isspace()
            ):
                query = query[len(phrase):].lstrip()
                stripped = True

    # Drop punctuation; keep word characters and internal whitespace.
    query = re.sub(r'[^\w\s]', '', query)

    return query.strip()
34
+
35
 
36
  # -------------------------
37
  # CLEANING FUNCTIONS
 
52
  "access denied",
53
  "enable javascript"
54
  ]
55
+
56
  text_lower = text.lower()
57
  return any(b in text_lower for b in blockers)
58
 
 
87
 
88
  soup = BeautifulSoup(r.text, "html.parser")
89
 
 
90
  for tag in soup(["script", "style", "noscript"]):
91
  tag.decompose()
92
 
 
108
  def scrape_knowledge(query: str, limit: int = 20) -> List[Dict]:
109
  knowledge = []
110
 
111
+ clean_query = normalize_query(query)
112
+
113
+ if not clean_query:
114
+ return knowledge
115
+
116
  for source_name, url in SOURCES.items():
117
  try:
118
+ formatted_query = clean_query.replace(" ", "_")
119
  full_url = url.format(query=formatted_query)
120
 
121
  page_text = scrape_page(full_url)
 
137
  except Exception:
138
  continue
139
 
140
+ return knowledge