Update crawler.py
crawler.py  +14 -22
crawler.py
CHANGED
@@ -2,17 +2,12 @@ import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 
-def safe_fetch(url):
-    try:
-        return requests.get(url, timeout=5)
-    except Exception:
-        return None
-
 class CodexCrawler:
     def __init__(self, seed_urls, max_depth=2):
         self.frontier = [(u, 0) for u in seed_urls]
         self.seen = set()
         self.graph = {}
+        self.max_depth = max_depth
 
     def crawl(self):
         results = {}
@@ -21,21 +16,18 @@ class CodexCrawler:
             if url in self.seen or depth > self.max_depth:
                 continue
 
-            resp = safe_fetch(url)
-            if resp is None:
+            try:
+                r = requests.get(url, timeout=5)
+                if r.status_code != 200:
+                    continue
+                self.seen.add(url)
+                results[url] = r.status_code
+                soup = BeautifulSoup(r.text, 'html.parser')
+                links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
+                self.graph[url] = links
+                for link in links:
+                    if link not in self.seen:
+                        self.frontier.append((link, depth + 1))
+            except:
                 continue
-
-            self.seen.add(url)
-            results[url] = resp.status_code
-
-            soup = BeautifulSoup(resp.text, "html.parser")
-            found = []
-            for a in soup.find_all("a", href=True):
-                link = urljoin(url, a["href"])
-                if link not in self.seen:
-                    found.append(link)
-                    self.frontier.append((link, depth + 1))
-
-            self.graph[url] = found
-
         return results, self.graph
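For reference, a minimal usage sketch of the updated class. The module name (crawler), the seed URL, and the printout are illustrative assumptions, not part of the commit; requests and beautifulsoup4 must be installed for the crawl to fetch anything.

# Hypothetical driver script -- assumes crawler.py is on the import path.
from crawler import CodexCrawler

# Seed the crawler with one starting page and follow links one hop deep.
crawler = CodexCrawler(["https://example.com"], max_depth=1)

# crawl() returns {url: HTTP status} for pages fetched successfully and
# {url: [outgoing links]} as a simple link graph.
results, graph = crawler.crawl()

for url, status in results.items():
    print(status, url, "->", len(graph.get(url, [])), "links found")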