R-Kentaren commited on
Commit
4176077
·
verified ·
1 Parent(s): 4412065

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. code/websearch/google_scraper.py +194 -38
  2. index.html +43 -0
code/websearch/google_scraper.py CHANGED
@@ -1,43 +1,60 @@
1
- """Web search via Google scraping β€” no API key needed.
2
 
3
- Uses requests with a browser-like User-Agent and BeautifulSoup
4
- to parse Google search result pages.
 
5
  """
6
 
7
  from __future__ import annotations
8
 
9
  import logging
 
10
  import urllib.parse
11
 
12
  logger = logging.getLogger(__name__)
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  def web_search_google(query: str, num_results: int = 8) -> list[dict[str, str]]:
16
- """Search Google by scraping the results page. No API key needed.
 
 
 
17
 
18
  Returns a list of dicts with keys: title, url, snippet.
19
- Uses requests with a browser-like User-Agent to avoid captchas.
20
  """
 
 
 
 
 
 
 
 
 
 
21
  try:
22
  import requests
23
  from bs4 import BeautifulSoup
24
 
25
  encoded_query = urllib.parse.quote_plus(query)
26
- url = f"https://www.google.com/search?q={encoded_query}&num={num_results + 2}&hl=en"
27
 
28
- headers = {
29
- "User-Agent": (
30
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
31
- "AppleWebKit/537.36 (KHTML, like Gecko) "
32
- "Chrome/120.0.0.0 Safari/537.36"
33
- ),
34
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
35
- "Accept-Language": "en-US,en;q=0.5",
36
- "Accept-Encoding": "gzip, deflate",
37
- "DNT": "1",
38
- "Connection": "keep-alive",
39
- "Upgrade-Insecure-Requests": "1",
40
- }
41
 
42
  resp = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
43
  resp.raise_for_status()
@@ -45,8 +62,96 @@ def web_search_google(query: str, num_results: int = 8) -> list[dict[str, str]]:
45
  soup = BeautifulSoup(resp.text, "html.parser")
46
  results: list[dict[str, str]] = []
47
 
48
- # Parse Google search results
49
- for g_div in soup.select("div.g, div[data-sokoban-container], div.yuRUbf"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  title_el = g_div.select_one("h3")
51
  link_el = g_div.select_one("a[href]")
52
  snippet_el = g_div.select_one("div.VwiC3b, span.aCOpRe, div[data-sncf]")
@@ -55,18 +160,8 @@ def web_search_google(query: str, num_results: int = 8) -> list[dict[str, str]]:
55
  continue
56
 
57
  href = link_el.get("href", "")
58
- # Google sometimes prefixes URLs; extract the real URL
59
- if href.startswith("/url?q="):
60
- real_url = urllib.parse.parse_qs(
61
- urllib.parse.urlparse(href).query
62
- ).get("q", [href])[0]
63
- elif href.startswith("http"):
64
- real_url = href
65
- else:
66
- continue
67
-
68
- # Skip Google-internal URLs
69
- if "google.com" in real_url or "googleusercontent.com" in real_url:
70
  continue
71
 
72
  title = title_el.get_text(strip=True)
@@ -82,7 +177,41 @@ def web_search_google(query: str, num_results: int = 8) -> list[dict[str, str]]:
82
  if len(results) >= num_results:
83
  break
84
 
85
- # Fallback: try parsing from <a> tags with data-ved attribute
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  if not results:
87
  for a_tag in soup.select("a[data-ved]"):
88
  href = a_tag.get("href", "")
@@ -93,29 +222,56 @@ def web_search_google(query: str, num_results: int = 8) -> list[dict[str, str]]:
93
 
94
  title_el = a_tag.select_one("h3, span")
95
  title = title_el.get_text(strip=True) if title_el else a_tag.get_text(strip=True)[:100]
96
- snippet = ""
97
 
98
  if title and href:
99
  results.append({
100
  "title": title,
101
  "url": href,
102
- "snippet": snippet,
103
  })
104
 
105
  if len(results) >= num_results:
106
  break
107
 
108
- logger.info("Web search for '%s' returned %d results", query, len(results))
109
  return results
110
 
111
  except ImportError:
112
  logger.warning("requests or beautifulsoup4 not installed for web search")
113
  return []
114
  except Exception as exc:
115
- logger.exception("Web search failed: %s", exc)
116
  return []
117
 
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  def format_search_results(results: list[dict[str, str]]) -> str:
120
  """Format search results into a text block for model context."""
121
  if not results:
 
1
+ """Web search via scraping — no API key needed.
2
 
3
+ Strategy:
4
+ 1. Primary: DuckDuckGo HTML (more scraper-friendly, fewer captchas)
5
+ 2. Fallback: Google search with robust multi-selector parsing
6
  """
7
 
8
  from __future__ import annotations
9
 
10
  import logging
11
+ import re
12
  import urllib.parse
13
 
14
  logger = logging.getLogger(__name__)
15
 
16
+ # Common browser-like headers to avoid bot detection
17
+ _BROWSER_HEADERS = {
18
+ "User-Agent": (
19
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
20
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
21
+ "Chrome/125.0.0.0 Safari/537.36"
22
+ ),
23
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
24
+ "Accept-Language": "en-US,en;q=0.5",
25
+ "Accept-Encoding": "gzip, deflate",
26
+ "DNT": "1",
27
+ "Connection": "keep-alive",
28
+ "Upgrade-Insecure-Requests": "1",
29
+ }
30
+
31
 
32
def web_search_google(query: str, num_results: int = 8) -> list[dict[str, str]]:
    """Search the web by scraping result pages; no API key is needed.

    DuckDuckGo is queried first. Google is used only when DuckDuckGo
    returns nothing.

    Args:
        query: Free-text search query.
        num_results: Maximum number of results to return.

    Returns:
        A list of dicts with keys ``title``, ``url`` and ``snippet``. The list
        is empty when the query is blank, when ``num_results`` is not
        positive, or when neither backend returns anything.
    """
    # Guard before touching the network: a blank query cannot produce useful
    # results, and the backends append a hit before checking the cap, so a
    # cap of 0 would otherwise still return one result.
    if not query or not query.strip() or num_results <= 0:
        return []

    # An empty list from DuckDuckGo is falsy, which triggers the Google fallback.
    return _search_duckduckgo(query, num_results) or _search_google(query, num_results)
46
+
47
+
48
+ def _search_duckduckgo(query: str, num_results: int) -> list[dict[str, str]]:
49
+ """Search DuckDuckGo HTML version β€” very scraper-friendly."""
50
  try:
51
  import requests
52
  from bs4 import BeautifulSoup
53
 
54
  encoded_query = urllib.parse.quote_plus(query)
55
+ url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
56
 
57
+ headers = {**_BROWSER_HEADERS, "Referer": "https://duckduckgo.com/"}
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  resp = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
60
  resp.raise_for_status()
 
62
  soup = BeautifulSoup(resp.text, "html.parser")
63
  results: list[dict[str, str]] = []
64
 
65
+ # DuckDuckGo HTML uses .result blocks
66
+ for result_div in soup.select(".result"):
67
+ title_el = result_div.select_one(".result__title a, .result__a")
68
+ snippet_el = result_div.select_one(".result__snippet")
69
+
70
+ if not title_el:
71
+ continue
72
+
73
+ title = title_el.get_text(strip=True)
74
+ # DDG uses redirect URLs like //duckduckgo.com/l/?uddg=...
75
+ href = title_el.get("href", "")
76
+
77
+ real_url = _extract_ddg_url(href)
78
+ if not real_url:
79
+ continue
80
+
81
+ # Skip internal URLs
82
+ if any(domain in real_url for domain in ["duckduckgo.com", "duck.co"]):
83
+ continue
84
+
85
+ snippet = snippet_el.get_text(strip=True) if snippet_el else ""
86
+
87
+ if title and real_url:
88
+ results.append({
89
+ "title": title,
90
+ "url": real_url,
91
+ "snippet": snippet,
92
+ })
93
+
94
+ if len(results) >= num_results:
95
+ break
96
+
97
+ logger.info("DuckDuckGo search for '%s' returned %d results", query, len(results))
98
+ return results
99
+
100
+ except ImportError:
101
+ logger.warning("requests or beautifulsoup4 not installed for web search")
102
+ return []
103
+ except Exception as exc:
104
+ logger.warning("DuckDuckGo search failed: %s", exc)
105
+ return []
106
+
107
+
108
+ def _extract_ddg_url(href: str) -> str | None:
109
+ """Extract the real URL from a DuckDuckGo redirect link."""
110
+ if not href:
111
+ return None
112
+
113
+ # Direct HTTP URL
114
+ if href.startswith("http"):
115
+ return href
116
+
117
+ # DDG redirect: //duckduckgo.com/l/?uddg=<encoded_url>&...
118
+ if "uddg=" in href:
119
+ parsed = urllib.parse.urlparse(href)
120
+ params = urllib.parse.parse_qs(parsed.query)
121
+ uddg = params.get("uddg", [])
122
+ if uddg:
123
+ return urllib.parse.unquote(uddg[0])
124
+
125
+ # Sometimes it's a relative redirect
126
+ if href.startswith("//"):
127
+ return "https:" + href
128
+
129
+ return None
130
+
131
+
132
+ def _search_google(query: str, num_results: int) -> list[dict[str, str]]:
133
+ """Search Google by scraping the results page. Fallback method."""
134
+ try:
135
+ import requests
136
+ from bs4 import BeautifulSoup
137
+
138
+ encoded_query = urllib.parse.quote_plus(query)
139
+ url = f"https://www.google.com/search?q={encoded_query}&num={num_results + 2}&hl=en"
140
+
141
+ headers = {**_BROWSER_HEADERS, "Referer": "https://www.google.com/"}
142
+
143
+ session = requests.Session()
144
+ # First get a cookie from Google
145
+ session.get("https://www.google.com/", headers=headers, timeout=5)
146
+
147
+ resp = session.get(url, headers=headers, timeout=10, allow_redirects=True)
148
+ resp.raise_for_status()
149
+
150
+ soup = BeautifulSoup(resp.text, "html.parser")
151
+ results: list[dict[str, str]] = []
152
+
153
+ # Strategy 1: Modern Google layout β€” div.g > div.yuRUbf (title+link) + div.VwiC3b (snippet)
154
+ for g_div in soup.select("div.g"):
155
  title_el = g_div.select_one("h3")
156
  link_el = g_div.select_one("a[href]")
157
  snippet_el = g_div.select_one("div.VwiC3b, span.aCOpRe, div[data-sncf]")
 
160
  continue
161
 
162
  href = link_el.get("href", "")
163
+ real_url = _extract_google_url(href)
164
+ if not real_url:
 
 
 
 
 
 
 
 
 
 
165
  continue
166
 
167
  title = title_el.get_text(strip=True)
 
177
  if len(results) >= num_results:
178
  break
179
 
180
+ # Strategy 2: Fallback β€” look for any <a> containing an <h3>
181
+ if not results:
182
+ for a_tag in soup.find_all("a", href=True):
183
+ h3 = a_tag.find("h3")
184
+ if not h3:
185
+ continue
186
+
187
+ href = a_tag.get("href", "")
188
+ real_url = _extract_google_url(href)
189
+ if not real_url:
190
+ continue
191
+
192
+ title = h3.get_text(strip=True)
193
+ # Try to find a sibling or nearby snippet
194
+ snippet = ""
195
+ parent = a_tag.parent
196
+ if parent:
197
+ for _ in range(3):
198
+ parent = parent.parent if parent else None
199
+ if parent:
200
+ snippet_el = parent.select_one("div.VwiC3b, span.aCOpRe, span.st")
201
+ if snippet_el:
202
+ snippet = snippet_el.get_text(strip=True)
203
+
204
+ if title and real_url:
205
+ results.append({
206
+ "title": title,
207
+ "url": real_url,
208
+ "snippet": snippet,
209
+ })
210
+
211
+ if len(results) >= num_results:
212
+ break
213
+
214
+ # Strategy 3: Last resort β€” any <a data-ved> with external href
215
  if not results:
216
  for a_tag in soup.select("a[data-ved]"):
217
  href = a_tag.get("href", "")
 
222
 
223
  title_el = a_tag.select_one("h3, span")
224
  title = title_el.get_text(strip=True) if title_el else a_tag.get_text(strip=True)[:100]
 
225
 
226
  if title and href:
227
  results.append({
228
  "title": title,
229
  "url": href,
230
+ "snippet": "",
231
  })
232
 
233
  if len(results) >= num_results:
234
  break
235
 
236
+ logger.info("Google search for '%s' returned %d results", query, len(results))
237
  return results
238
 
239
  except ImportError:
240
  logger.warning("requests or beautifulsoup4 not installed for web search")
241
  return []
242
  except Exception as exc:
243
+ logger.warning("Google search failed: %s", exc)
244
  return []
245
 
246
 
247
+ def _extract_google_url(href: str) -> str | None:
248
+ """Extract the real URL from a Google search result link."""
249
+ if not href:
250
+ return None
251
+
252
+ # Google redirect: /url?q=<real_url>&...
253
+ if href.startswith("/url?q="):
254
+ parsed = urllib.parse.urlparse(href)
255
+ params = urllib.parse.parse_qs(parsed.query)
256
+ q = params.get("q", [])
257
+ if q:
258
+ real_url = q[0]
259
+ if real_url.startswith("http"):
260
+ return real_url
261
+
262
+ # Direct HTTP URL
263
+ if href.startswith("http"):
264
+ # Skip Google-internal URLs
265
+ if any(domain in href for domain in [
266
+ "google.com", "googleusercontent.com",
267
+ "youtube.com", "gstatic.com",
268
+ ]):
269
+ return None
270
+ return href
271
+
272
+ return None
273
+
274
+
275
  def format_search_results(results: list[dict[str, str]]) -> str:
276
  """Format search results into a text block for model context."""
277
  if not results:
index.html CHANGED
@@ -188,6 +188,30 @@ a:hover { text-decoration: underline; text-shadow: var(--glow-cyan); }
188
  text-shadow: var(--glow-amber);
189
  }
190
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  /* ═══════════════════════════════════════════════════════
192
  BANNER
193
  ═══════════════════════════════════════════════════════ */
@@ -328,6 +352,9 @@ a:hover { text-decoration: underline; text-shadow: var(--glow-cyan); }
328
  }
329
  .think-block:not(.open) .think-content { display: none; }
330
 
 
 
 
331
  /* Streaming cursor */
332
  .streaming-cursor::after {
333
  content: '\u2588'; animation: blink 0.8s step-end infinite;
@@ -917,6 +944,7 @@ a:hover { text-decoration: underline; text-shadow: var(--glow-cyan); }
917
  <span class="dot loading" id="model-dot"></span>
918
  <span id="model-pill-text">MiniCPM5-1B</span>
919
  </a>
 
920
  <button id="btn-new-chat" onclick="newChat()" title="Start a new chat session">[NEW]</button>
921
  </div>
922
  </header>
@@ -1094,6 +1122,7 @@ const state = {
1094
  modelReady: false,
1095
  searchEnabled: false,
1096
  lastSearchResults: [],
 
1097
  };
1098
 
1099
  // ═══════════════════════════════════════════════════════
@@ -1908,6 +1937,20 @@ function newChat() {
1908
  resetConversation(`Session reset. Welcome back to ${CONFIG.app_title || 'Fullstack Code Builder'}.`);
1909
  }
1910
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1911
  // ═══════════════════════════════════════════════════════
1912
  // WEB SEARCH
1913
  // ═══════════════════════════════════════════════════════
 
188
  text-shadow: var(--glow-amber);
189
  }
190
 
191
+ .btn-thinking {
192
+ background: transparent;
193
+ border: 1px solid var(--border);
194
+ color: var(--purple);
195
+ font-family: var(--font-mono);
196
+ font-size: 11px;
197
+ padding: 5px 12px;
198
+ border-radius: var(--radius);
199
+ cursor: pointer;
200
+ transition: all var(--transition);
201
+ letter-spacing: 0.5px;
202
+ }
203
+ .btn-thinking:hover {
204
+ border-color: var(--purple);
205
+ background: rgba(168,85,247,0.08);
206
+ text-shadow: var(--glow-purple);
207
+ }
208
+ .btn-thinking.active {
209
+ border-color: var(--purple);
210
+ background: rgba(168,85,247,0.15);
211
+ color: var(--purple);
212
+ text-shadow: var(--glow-purple);
213
+ }
214
+
215
  /* ═══════════════════════════════════════════════════════
216
  BANNER
217
  ═══════════════════════════════════════════════════════ */
 
352
  }
353
  .think-block:not(.open) .think-content { display: none; }
354
 
355
+ /* Hide thinking blocks entirely when toggle is off */
356
+ body.hide-thinking .think-block { display: none; }
357
+
358
  /* Streaming cursor */
359
  .streaming-cursor::after {
360
  content: '\u2588'; animation: blink 0.8s step-end infinite;
 
944
  <span class="dot loading" id="model-dot"></span>
945
  <span id="model-pill-text">MiniCPM5-1B</span>
946
  </a>
947
+ <button id="btn-thinking" class="btn-thinking active" onclick="toggleThinking()" title="Show/hide thinking blocks">🧠 Think</button>
948
  <button id="btn-new-chat" onclick="newChat()" title="Start a new chat session">[NEW]</button>
949
  </div>
950
  </header>
 
1122
  modelReady: false,
1123
  searchEnabled: false,
1124
  lastSearchResults: [],
1125
+ showThinking: true,
1126
  };
1127
 
1128
  // ═══════════════════════════════════════════════════════
 
1937
  resetConversation(`Session reset. Welcome back to ${CONFIG.app_title || 'Fullstack Code Builder'}.`);
1938
  }
1939
 
1940
/**
 * Toggle whether thinking blocks are shown.
 *
 * Flips `state.showThinking`, then mirrors it onto the page: the body gets
 * the `hide-thinking` class while thinking is hidden, and the
 * `#btn-thinking` button carries `active` while thinking is shown.
 */
function toggleThinking() {
  state.showThinking = !state.showThinking;
  const visible = state.showThinking;

  // Update the body first so the page stays consistent with the state even
  // if the button is not present in the DOM.
  document.body.classList.toggle('hide-thinking', !visible);

  const btn = document.getElementById('btn-thinking');
  if (!btn) return;

  btn.classList.toggle('active', visible);
  // The label is the same in both states. The escape is U+1F9E0 (brain
  // emoji), written this way so it survives re-encoding of the file.
  btn.textContent = '\u{1F9E0} Think';
}
1953
+
1954
  // ═══════════════════════════════════════════════════════
1955
  // WEB SEARCH
1956
  // ═══════════════════════════════════════════════════════