Musombi committed on
Commit
6701acf
·
1 Parent(s): 2639f51

Update reasoning/scraper.py

Browse files
Files changed (1) hide show
  1. reasoning/scraper.py +34 -85
reasoning/scraper.py CHANGED
@@ -1,10 +1,9 @@
1
- import aiohttp
2
- import asyncio
3
  import re
4
  import time
5
  import json
6
  import os
7
- from bs4 import BeautifulSoup
8
  from typing import List, Dict
9
 
10
  HEADERS = {
@@ -19,7 +18,7 @@ DDG_SEARCH = "https://duckduckgo.com/html/?q={query}"
19
 
20
 
21
  # -------------------------
22
- # CACHE (PERSISTENT)
23
  # -------------------------
24
 
25
  def load_cache():
@@ -36,38 +35,19 @@ CACHE = load_cache()
36
 
37
 
38
  # -------------------------
39
- # QUERY PROCESSING
40
  # -------------------------
41
 
42
  def normalize_query(query: str) -> str:
43
  query = query.lower()
44
 
45
- stop_phrases = [
46
- "what is", "who is", "define",
47
- "explain", "tell me about",
48
- "what are", "how does"
49
- ]
50
-
51
- for phrase in stop_phrases:
52
  query = query.replace(phrase, "")
53
 
54
  query = re.sub(r"[^\w\s]", "", query)
55
  return query.strip()
56
 
57
 
58
- def expand_query(query: str) -> List[str]:
59
- return [
60
- query,
61
- f"{query} definition",
62
- f"{query} meaning",
63
- f"{query} explanation"
64
- ]
65
-
66
-
67
- # -------------------------
68
- # CLEANING
69
- # -------------------------
70
-
71
  def clean_text(text: str) -> str:
72
  text = re.sub(r"\[\d+\]", "", text)
73
  text = re.sub(r"\s+", " ", text)
@@ -78,23 +58,22 @@ def clean_text(text: str) -> str:
78
  # FETCH
79
  # -------------------------
80
 
81
- async def fetch(session, url):
82
  try:
83
- async with session.get(url, headers=HEADERS, timeout=8) as r:
84
- if r.status != 200:
85
- return ""
86
- return await r.text()
87
  except:
88
  return ""
89
 
90
 
91
  # -------------------------
92
- # PARSERS
93
  # -------------------------
94
 
95
  def extract_paragraphs(html: str) -> List[str]:
96
  soup = BeautifulSoup(html, "html.parser")
97
-
98
  paragraphs = soup.find_all("p")
99
 
100
  results = []
@@ -115,29 +94,25 @@ def extract_wiki(html: str) -> List[str]:
115
 
116
 
117
  # -------------------------
118
- # SEARCH FALLBACKS
119
  # -------------------------
120
 
121
- async def wikipedia_search(session, query):
122
- url = WIKI_SEARCH.format(query=query.replace(" ", "+"))
123
- html = await fetch(session, url)
124
-
125
  soup = BeautifulSoup(html, "html.parser")
126
- result = soup.select_one(".mw-search-result-heading a")
127
 
 
128
  if result:
129
  return "https://en.wikipedia.org" + result.get("href")
130
 
131
  return ""
132
 
133
 
134
- async def duckduckgo_search(session, query):
135
- url = DDG_SEARCH.format(query=query.replace(" ", "+"))
136
- html = await fetch(session, url)
137
-
138
  soup = BeautifulSoup(html, "html.parser")
139
- links = []
140
 
 
141
  for a in soup.select(".result__a"):
142
  href = a.get("href")
143
  if href and href.startswith("http"):
@@ -150,73 +125,41 @@ async def duckduckgo_search(session, query):
150
  # SCRAPERS
151
  # -------------------------
152
 
153
- async def scrape_wikipedia(session, query):
154
  url = WIKI_PAGE.format(query=query.replace(" ", "_"))
155
- html = await fetch(session, url)
156
 
157
  if "Wikipedia does not have an article" in html:
158
- url = await wikipedia_search(session, query)
159
  if not url:
160
  return []
161
-
162
- html = await fetch(session, url)
163
 
164
  return extract_wiki(html)
165
 
166
 
167
- async def scrape_generic(session, url):
168
- html = await fetch(session, url)
169
  return extract_paragraphs(html)
170
 
171
 
172
  # -------------------------
173
- # RANKING (TF STYLE)
174
  # -------------------------
175
 
176
  def rank_results(paragraphs: List[str], query: str) -> List[str]:
177
  q_words = set(query.lower().split())
178
 
179
  def score(p):
180
- words = set(p.lower().split())
181
- return len(q_words & words)
182
 
183
  return sorted(paragraphs, key=score, reverse=True)
184
 
185
 
186
  # -------------------------
187
- # MAIN ENGINE
188
  # -------------------------
189
 
190
- async def async_scrape(query: str, limit: int):
191
- async with aiohttp.ClientSession() as session:
192
-
193
- queries = expand_query(query)
194
-
195
- tasks = []
196
-
197
- # Wikipedia tasks
198
- for q in queries:
199
- tasks.append(scrape_wikipedia(session, q))
200
-
201
- results = await asyncio.gather(*tasks)
202
-
203
- paragraphs = []
204
- for r in results:
205
- paragraphs.extend(r)
206
-
207
- # Fallback to DuckDuckGo if empty
208
- if not paragraphs:
209
- links = await duckduckgo_search(session, query)
210
-
211
- tasks = [scrape_generic(session, link) for link in links]
212
- results = await asyncio.gather(*tasks)
213
-
214
- for r in results:
215
- paragraphs.extend(r)
216
-
217
- return paragraphs
218
-
219
-
220
  def scrape_knowledge(query: str, limit: int = 5) -> List[Dict]:
221
  if query in CACHE:
222
  return CACHE[query]
@@ -225,7 +168,13 @@ def scrape_knowledge(query: str, limit: int = 5) -> List[Dict]:
225
  if not clean_query:
226
  return []
227
 
228
- paragraphs = asyncio.run(async_scrape(clean_query, limit))
 
 
 
 
 
 
229
 
230
  if not paragraphs:
231
  return []
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
  import re
4
  import time
5
  import json
6
  import os
 
7
  from typing import List, Dict
8
 
9
  HEADERS = {
 
18
 
19
 
20
  # -------------------------
21
+ # CACHE
22
  # -------------------------
23
 
24
  def load_cache():
 
35
 
36
 
37
  # -------------------------
38
+ # UTIL
39
  # -------------------------
40
 
41
def normalize_query(query: str) -> str:
    """Lower-case *query* and strip common question lead-ins and punctuation.

    Lead-in phrases ("what is", "who is", "define", "explain") are removed
    only as whole words: a word-boundary regex is used instead of plain
    str.replace, which would mangle words that merely contain a phrase
    (e.g. "redefine" -> "re").

    Returns the cleaned query with punctuation removed and whitespace
    collapsed; may be the empty string.
    """
    query = query.lower()

    # \b anchors keep the match on word boundaries so embedded occurrences
    # inside longer words are left alone.
    query = re.sub(r"\b(what is|who is|define|explain)\b", "", query)

    query = re.sub(r"[^\w\s]", "", query)
    # Collapse runs of spaces left behind by the phrase removal.
    query = re.sub(r"\s+", " ", query)
    return query.strip()
49
 
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def clean_text(text: str) -> str:
52
  text = re.sub(r"\[\d+\]", "", text)
53
  text = re.sub(r"\s+", " ", text)
 
58
  # FETCH
59
  # -------------------------
60
 
61
def fetch(url: str) -> str:
    """GET *url* and return the response body, or "" on any failure.

    Non-200 responses, timeouts, and connection errors all map to "" so
    callers can treat "no content" uniformly.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=8)
        if r.status_code != 200:
            return ""
        return r.text
    except requests.RequestException:
        # Narrow catch: a bare `except:` would also swallow
        # KeyboardInterrupt and SystemExit.
        return ""
69
 
70
 
71
  # -------------------------
72
+ # PARSE
73
  # -------------------------
74
 
75
  def extract_paragraphs(html: str) -> List[str]:
76
  soup = BeautifulSoup(html, "html.parser")
 
77
  paragraphs = soup.find_all("p")
78
 
79
  results = []
 
94
 
95
 
96
  # -------------------------
97
+ # SEARCH FALLBACK
98
  # -------------------------
99
 
100
def wikipedia_search(query: str) -> str:
    """Resolve *query* to a Wikipedia article URL via the search page.

    Fetches the search results, takes the first result heading's link,
    and returns its absolute URL. Returns "" when the fetch failed, no
    result was found, or the result anchor has no href (the original
    would raise TypeError concatenating None in that last case).
    """
    html = fetch(WIKI_SEARCH.format(query=query.replace(" ", "+")))
    soup = BeautifulSoup(html, "html.parser")

    result = soup.select_one(".mw-search-result-heading a")
    if result:
        href = result.get("href")
        # Guard: an anchor without an href would otherwise make the
        # concatenation below raise TypeError.
        if href:
            return "https://en.wikipedia.org" + href
    return ""
109
 
110
 
111
+ def duckduckgo_search(query: str) -> List[str]:
112
+ html = fetch(DDG_SEARCH.format(query=query.replace(" ", "+")))
 
 
113
  soup = BeautifulSoup(html, "html.parser")
 
114
 
115
+ links = []
116
  for a in soup.select(".result__a"):
117
  href = a.get("href")
118
  if href and href.startswith("http"):
 
125
  # SCRAPERS
126
  # -------------------------
127
 
128
def scrape_wikipedia(query: str) -> List[str]:
    """Fetch the Wikipedia article for *query* and return its paragraphs.

    Tries the direct page URL first. If the page is missing — or the
    fetch itself failed and returned "" (the original skipped the
    fallback in that case and returned nothing) — falls back to the
    Wikipedia search results for a best-match article.

    Returns an empty list when no article can be located.
    """
    url = WIKI_PAGE.format(query=query.replace(" ", "_"))
    html = fetch(url)

    # Empty html means the fetch failed; treat it like a missing article
    # so the search fallback still gets a chance.
    if not html or "Wikipedia does not have an article" in html:
        url = wikipedia_search(query)
        if not url:
            return []
        html = fetch(url)

    return extract_wiki(html)
139
 
140
 
141
def scrape_generic(url: str) -> List[str]:
    """Download *url* and extract its paragraph texts."""
    return extract_paragraphs(fetch(url))
144
 
145
 
146
  # -------------------------
147
+ # RANKING
148
  # -------------------------
149
 
150
def rank_results(paragraphs: List[str], query: str) -> List[str]:
    """Order *paragraphs* by how many distinct query words each contains.

    Scoring compares whole words (set intersection), not substrings:
    the previous `word in p.lower()` test let "cat" score "catalog".
    The sort is stable, so equally-scored paragraphs keep their order.
    """
    q_words = set(query.lower().split())

    def score(p):
        # Distinct whole-word overlap between the paragraph and the query.
        return len(q_words & set(p.lower().split()))

    return sorted(paragraphs, key=score, reverse=True)
157
 
158
 
159
  # -------------------------
160
+ # MAIN
161
  # -------------------------
162
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  def scrape_knowledge(query: str, limit: int = 5) -> List[Dict]:
164
  if query in CACHE:
165
  return CACHE[query]
 
168
  if not clean_query:
169
  return []
170
 
171
+ paragraphs = scrape_wikipedia(clean_query)
172
+
173
+ if not paragraphs:
174
+ links = duckduckgo_search(clean_query)
175
+
176
+ for link in links:
177
+ paragraphs.extend(scrape_generic(link))
178
 
179
  if not paragraphs:
180
  return []