Spaces:

R-Kentaren
/

fullstack-code-builder

Running

App Files Files Community

R-Kentaren committed 17 days ago

Commit

4176077

verified ·

1 Parent(s): 4412065

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

code/websearch/google_scraper.py +194 -38
index.html +43 -0

code/websearch/google_scraper.py CHANGED Viewed

@@ -1,43 +1,60 @@
-"""Web search via Google scraping — no API key needed.
-Uses requests with a browser-like User-Agent and BeautifulSoup
-to parse Google search result pages.
 """
 from __future__ import annotations
 import logging
 import urllib.parse
 logger = logging.getLogger(__name__)
 def web_search_google(query: str, num_results: int = 8) -> list[dict[str, str]]:
-    """Search Google by scraping the results page. No API key needed.
     Returns a list of dicts with keys: title, url, snippet.
-    Uses requests with a browser-like User-Agent to avoid captchas.
     """
     try:
         import requests
         from bs4 import BeautifulSoup
         encoded_query = urllib.parse.quote_plus(query)
-        url = f"https://www.google.com/search?q={encoded_query}&num={num_results + 2}&hl=en"
-        headers = {
-            "User-Agent": (
-                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-                "AppleWebKit/537.36 (KHTML, like Gecko) "
-                "Chrome/120.0.0.0 Safari/537.36"
-            ),
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-            "Accept-Language": "en-US,en;q=0.5",
-            "Accept-Encoding": "gzip, deflate",
-            "DNT": "1",
-            "Connection": "keep-alive",
-            "Upgrade-Insecure-Requests": "1",
-        }
         resp = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
         resp.raise_for_status()
@@ -45,8 +62,96 @@ def web_search_google(query: str, num_results: int = 8) -> list[dict[str, str]]:
         soup = BeautifulSoup(resp.text, "html.parser")
         results: list[dict[str, str]] = []
-        # Parse Google search results
-        for g_div in soup.select("div.g, div[data-sokoban-container], div.yuRUbf"):
             title_el = g_div.select_one("h3")
             link_el = g_div.select_one("a[href]")
             snippet_el = g_div.select_one("div.VwiC3b, span.aCOpRe, div[data-sncf]")
@@ -55,18 +160,8 @@ def web_search_google(query: str, num_results: int = 8) -> list[dict[str, str]]:
                 continue
             href = link_el.get("href", "")
-            # Google sometimes prefixes URLs; extract the real URL
-            if href.startswith("/url?q="):
-                real_url = urllib.parse.parse_qs(
-                    urllib.parse.urlparse(href).query
-                ).get("q", [href])[0]
-            elif href.startswith("http"):
-                real_url = href
-            else:
-                continue
-            # Skip Google-internal URLs
-            if "google.com" in real_url or "googleusercontent.com" in real_url:
                 continue
             title = title_el.get_text(strip=True)
@@ -82,7 +177,41 @@ def web_search_google(query: str, num_results: int = 8) -> list[dict[str, str]]:
             if len(results) >= num_results:
                 break
-        # Fallback: try parsing from <a> tags with data-ved attribute
         if not results:
             for a_tag in soup.select("a[data-ved]"):
                 href = a_tag.get("href", "")
@@ -93,29 +222,56 @@ def web_search_google(query: str, num_results: int = 8) -> list[dict[str, str]]:
                 title_el = a_tag.select_one("h3, span")
                 title = title_el.get_text(strip=True) if title_el else a_tag.get_text(strip=True)[:100]
-                snippet = ""
                 if title and href:
                     results.append({
                         "title": title,
                         "url": href,
-                        "snippet": snippet,
                     })
                 if len(results) >= num_results:
                     break
-        logger.info("Web search for '%s' returned %d results", query, len(results))
         return results
     except ImportError:
         logger.warning("requests or beautifulsoup4 not installed for web search")
         return []
     except Exception as exc:
-        logger.exception("Web search failed: %s", exc)
         return []
 def format_search_results(results: list[dict[str, str]]) -> str:
     """Format search results into a text block for model context."""
     if not results:

+"""Web search via scraping — no API key needed.
+Strategy:
+1. Primary: DuckDuckGo HTML (more scraper-friendly, fewer captchas)
+2. Fallback: Google search with robust multi-selector parsing
 """
 from __future__ import annotations
 import logging
+import re
 import urllib.parse
 logger = logging.getLogger(__name__)
+# Common browser-like headers to avoid bot detection
+_BROWSER_HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/125.0.0.0 Safari/537.36"
+    ),
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+    "Accept-Language": "en-US,en;q=0.5",
+    "Accept-Encoding": "gzip, deflate",
+    "DNT": "1",
+    "Connection": "keep-alive",
+    "Upgrade-Insecure-Requests": "1",
+}
 def web_search_google(query: str, num_results: int = 8) -> list[dict[str, str]]:
     """Scrape the web for *query* without needing an API key.

     DuckDuckGo is queried first; Google is only consulted when the
     DuckDuckGo scrape comes back empty.

     Returns a list of dicts with keys: title, url, snippet.
     """
     # An empty result list is falsy, so ``or`` moves on to the Google fallback.
     return _search_duckduckgo(query, num_results) or _search_google(query, num_results)
def _search_duckduckgo(query: str, num_results: int) -> list[dict[str, str]]:
    """Scrape DuckDuckGo's HTML-only endpoint for *query*.

    Args:
        query: Free-text search query; it is URL-encoded before sending.
        num_results: Maximum number of results to return. Values below 1
            yield an empty list without touching the network.

    Returns:
        Up to ``num_results`` dicts with keys ``title``, ``url`` and
        ``snippet``. An empty list is returned when nothing could be parsed,
        when ``requests``/``beautifulsoup4`` are not installed, or when the
        request fails; failures are logged and never raised.
    """
    if num_results <= 0:
        return []

    # Hosts that belong to DuckDuckGo itself (ads, settings, redirects).
    internal_hosts = ("duckduckgo.com", "duck.co")
    internal_suffixes = tuple("." + host for host in internal_hosts)

    try:
        import requests
        from bs4 import BeautifulSoup

        encoded_query = urllib.parse.quote_plus(query)
        url = f"https://html.duckduckgo.com/html/?q={encoded_query}"
        headers = {**_BROWSER_HEADERS, "Referer": "https://duckduckgo.com/"}
        resp = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        results: list[dict[str, str]] = []

        # DuckDuckGo HTML uses .result blocks
        for result_div in soup.select(".result"):
            title_el = result_div.select_one(".result__title a, .result__a")
            if not title_el:
                continue

            title = title_el.get_text(strip=True)
            # DDG uses redirect URLs like //duckduckgo.com/l/?uddg=...
            real_url = _extract_ddg_url(title_el.get("href", ""))
            if not title or not real_url:
                continue

            # Skip internal URLs. Compare the hostname rather than searching
            # the whole URL, so external pages that merely mention
            # "duckduckgo.com" in their path or query are kept.
            try:
                host = (urllib.parse.urlparse(real_url).hostname or "").lower()
            except ValueError:
                continue  # unparseable URL; useless as a search result
            if host in internal_hosts or host.endswith(internal_suffixes):
                continue

            snippet_el = result_div.select_one(".result__snippet")
            results.append({
                "title": title,
                "url": real_url,
                "snippet": snippet_el.get_text(strip=True) if snippet_el else "",
            })
            if len(results) >= num_results:
                break

        logger.info("DuckDuckGo search for '%s' returned %d results", query, len(results))
        return results
    except ImportError:
        logger.warning("requests or beautifulsoup4 not installed for web search")
        return []
    except Exception as exc:
        # Best-effort scraper: report the failure and let the caller fall back.
        logger.warning("DuckDuckGo search failed: %s", exc)
        return []
+def _extract_ddg_url(href: str) -> str | None:
+    """Extract the real URL from a DuckDuckGo redirect link."""
+    if not href:
+        return None
+    # Direct HTTP URL
+    if href.startswith("http"):
+        return href
+    # DDG redirect: //duckduckgo.com/l/?uddg=<encoded_url>&...
+    if "uddg=" in href:
+        parsed = urllib.parse.urlparse(href)
+        params = urllib.parse.parse_qs(parsed.query)
+        uddg = params.get("uddg", [])
+        if uddg:
+            return urllib.parse.unquote(uddg[0])
+    # Sometimes it's a relative redirect
+    if href.startswith("//"):
+        return "https:" + href
+    return None
+def _search_google(query: str, num_results: int) -> list[dict[str, str]]:
+    """Search Google by scraping the results page. Fallback method."""
+    try:
+        import requests
+        from bs4 import BeautifulSoup
+        encoded_query = urllib.parse.quote_plus(query)
+        url = f"https://www.google.com/search?q={encoded_query}&num={num_results + 2}&hl=en"
+        headers = {**_BROWSER_HEADERS, "Referer": "https://www.google.com/"}
+        session = requests.Session()
+        # First get a cookie from Google
+        session.get("https://www.google.com/", headers=headers, timeout=5)
+        resp = session.get(url, headers=headers, timeout=10, allow_redirects=True)
+        resp.raise_for_status()
+        soup = BeautifulSoup(resp.text, "html.parser")
+        results: list[dict[str, str]] = []
+        # Strategy 1: Modern Google layout — div.g > div.yuRUbf (title+link) + div.VwiC3b (snippet)
+        for g_div in soup.select("div.g"):
             title_el = g_div.select_one("h3")
             link_el = g_div.select_one("a[href]")
             snippet_el = g_div.select_one("div.VwiC3b, span.aCOpRe, div[data-sncf]")
                 continue
             href = link_el.get("href", "")
+            real_url = _extract_google_url(href)
+            if not real_url:
                 continue
             title = title_el.get_text(strip=True)
             if len(results) >= num_results:
                 break
+        # Strategy 2: Fallback — look for any <a> containing an <h3>
+        if not results:
+            for a_tag in soup.find_all("a", href=True):
+                h3 = a_tag.find("h3")
+                if not h3:
+                    continue
+                href = a_tag.get("href", "")
+                real_url = _extract_google_url(href)
+                if not real_url:
+                    continue
+                title = h3.get_text(strip=True)
+                # Try to find a sibling or nearby snippet
+                snippet = ""
+                parent = a_tag.parent
+                if parent:
+                    for _ in range(3):
+                        parent = parent.parent if parent else None
+                    if parent:
+                        snippet_el = parent.select_one("div.VwiC3b, span.aCOpRe, span.st")
+                        if snippet_el:
+                            snippet = snippet_el.get_text(strip=True)
+                if title and real_url:
+                    results.append({
+                        "title": title,
+                        "url": real_url,
+                        "snippet": snippet,
+                    })
+                if len(results) >= num_results:
+                    break
+        # Strategy 3: Last resort — any <a data-ved> with external href
         if not results:
             for a_tag in soup.select("a[data-ved]"):
                 href = a_tag.get("href", "")
                 title_el = a_tag.select_one("h3, span")
                 title = title_el.get_text(strip=True) if title_el else a_tag.get_text(strip=True)[:100]
                 if title and href:
                     results.append({
                         "title": title,
                         "url": href,
+                        "snippet": "",
                     })
                 if len(results) >= num_results:
                     break
+        logger.info("Google search for '%s' returned %d results", query, len(results))
         return results
     except ImportError:
         logger.warning("requests or beautifulsoup4 not installed for web search")
         return []
     except Exception as exc:
+        logger.warning("Google search failed: %s", exc)
         return []
+def _extract_google_url(href: str) -> str | None:
+    """Extract the real URL from a Google search result link."""
+    if not href:
+        return None
+    # Google redirect: /url?q=<real_url>&...
+    if href.startswith("/url?q="):
+        parsed = urllib.parse.urlparse(href)
+        params = urllib.parse.parse_qs(parsed.query)
+        q = params.get("q", [])
+        if q:
+            real_url = q[0]
+            if real_url.startswith("http"):
+                return real_url
+    # Direct HTTP URL
+    if href.startswith("http"):
+        # Skip Google-internal URLs
+        if any(domain in href for domain in [
+            "google.com", "googleusercontent.com",
+            "youtube.com", "gstatic.com",
+        ]):
+            return None
+        return href
+    return None
 def format_search_results(results: list[dict[str, str]]) -> str:
     """Format search results into a text block for model context."""
     if not results:

index.html CHANGED Viewed

@@ -188,6 +188,30 @@ a:hover { text-decoration: underline; text-shadow: var(--glow-cyan); }
   text-shadow: var(--glow-amber);
 }
 /* ═══════════════════════════════════════════════════════
    BANNER
    ═══════════════════════════════════════════════════════ */
@@ -328,6 +352,9 @@ a:hover { text-decoration: underline; text-shadow: var(--glow-cyan); }
 }
 .think-block:not(.open) .think-content { display: none; }
 /* Streaming cursor */
 .streaming-cursor::after {
   content: '\u2588'; animation: blink 0.8s step-end infinite;
@@ -917,6 +944,7 @@ a:hover { text-decoration: underline; text-shadow: var(--glow-cyan); }
         <span class="dot loading" id="model-dot"></span>
         <span id="model-pill-text">MiniCPM5-1B</span>
       </a>
       <button id="btn-new-chat" onclick="newChat()" title="Start a new chat session">[NEW]</button>
     </div>
   </header>
@@ -1094,6 +1122,7 @@ const state = {
   modelReady: false,
   searchEnabled: false,
   lastSearchResults: [],
 };
 // ═══════════════════════════════════════════════════════
@@ -1908,6 +1937,20 @@ function newChat() {
   resetConversation(`Session reset. Welcome back to ${CONFIG.app_title || 'Fullstack Code Builder'}.`);
 }
 // ═══════════════════════════════════════════════════════
 // WEB SEARCH
 // ═══════════════════════════════════════════════════════

   text-shadow: var(--glow-amber);
 }
/* Header toggle button for showing/hiding thinking blocks. */
.btn-thinking {
  padding: 5px 12px;
  border: 1px solid var(--border);
  border-radius: var(--radius);
  background: transparent;
  color: var(--purple);
  font-family: var(--font-mono);
  font-size: 11px;
  letter-spacing: 0.5px;
  cursor: pointer;
  transition: all var(--transition);
}
/* Hover and active share the purple outline and glow. */
.btn-thinking:hover,
.btn-thinking.active {
  border-color: var(--purple);
  text-shadow: var(--glow-purple);
}
/* Fill strength differs: faint on hover, stronger while active.
   The active rule stays last so it wins when both states apply. */
.btn-thinking:hover {
  background: rgba(168,85,247,0.08);
}
.btn-thinking.active {
  background: rgba(168,85,247,0.15);
  color: var(--purple);
}
 /* ═══════════════════════════════════════════════════════
    BANNER
    ═══════════════════════════════════════════════════════ */
 }
 .think-block:not(.open) .think-content { display: none; }
+/* Hide thinking blocks entirely when toggle is off */
+body.hide-thinking .think-block { display: none; }
 /* Streaming cursor */
 .streaming-cursor::after {
   content: '\u2588'; animation: blink 0.8s step-end infinite;
         <span class="dot loading" id="model-dot"></span>
         <span id="model-pill-text">MiniCPM5-1B</span>
       </a>
+      <button id="btn-thinking" class="btn-thinking active" onclick="toggleThinking()" title="Show/hide thinking blocks">🧠 Think</button>
       <button id="btn-new-chat" onclick="newChat()" title="Start a new chat session">[NEW]</button>
     </div>
   </header>
   modelReady: false,
   searchEnabled: false,
   lastSearchResults: [],
+  showThinking: true,
 };
 // ═══════════════════════════════════════════════════════
   resetConversation(`Session reset. Welcome back to ${CONFIG.app_title || 'Fullstack Code Builder'}.`);
 }
/**
 * Flip visibility of the "thinking" blocks.
 *
 * Inverts state.showThinking, then mirrors the new value onto the DOM:
 * the `hide-thinking` class on <body> is present while thinking is hidden,
 * and the #btn-thinking button carries `active` / aria-pressed while it is
 * shown.
 */
function toggleThinking() {
  state.showThinking = !state.showThinking;
  const visible = state.showThinking;

  // Update <body> first so the page follows the state even if the button
  // is not in the DOM.
  document.body.classList.toggle('hide-thinking', !visible);

  const btn = document.getElementById('btn-thinking');
  if (!btn) return;

  btn.classList.toggle('active', visible);
  // Expose the on/off state to assistive technology; the label is the same
  // in both states, so the class alone is only a visual cue.
  btn.setAttribute('aria-pressed', String(visible));
  btn.textContent = '🧠 Think';
}
 // ═══════════════════════════════════════════════════════
 // WEB SEARCH
 // ═══════════════════════════════════════════════════════