Update crawler.py
crawler.py  +14 -22
crawler.py
CHANGED
@@ -2,17 +2,12 @@ import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 
-def safe_fetch(url):
-    try:
-        return requests.get(url, timeout=5)
-    except Exception:
-        return None
-
 class CodexCrawler:
     def __init__(self, seed_urls, max_depth=2):
         self.frontier = [(u, 0) for u in seed_urls]
         self.seen = set()
         self.graph = {}
+        self.max_depth = max_depth
 
     def crawl(self):
         results = {}
@@ -21,21 +16,18 @@ class CodexCrawler:
             if url in self.seen or depth > self.max_depth:
                 continue
 
-            resp = safe_fetch(url)
-            if resp is None:
+            try:
+                r = requests.get(url, timeout=5)
+                if r.status_code != 200:
+                    continue
+                self.seen.add(url)
+                results[url] = r.status_code
+                soup = BeautifulSoup(r.text, 'html.parser')
+                links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
+                self.graph[url] = links
+                for link in links:
+                    if link not in self.seen:
+                        self.frontier.append((link, depth + 1))
+            except:
                 continue
-
-            self.seen.add(url)
-            results[url] = resp.status_code
-
-            soup = BeautifulSoup(resp.text, "html.parser")
-            found = []
-            for a in soup.find_all("a", href=True):
-                link = urljoin(url, a["href"])
-                if link not in self.seen:
-                    found.append(link)
-                    self.frontier.append((link, depth + 1))
-
-            self.graph[url] = found
-
         return results, self.graph
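For reference, a minimal usage sketch of the updated class. The module name (crawler), the seed URL, and the printout are illustrative assumptions, not part of the commit; requests and beautifulsoup4 must be installed for the crawl to fetch anything.

# Hypothetical driver script -- assumes crawler.py is on the import path.
from crawler import CodexCrawler

# Seed the crawler with one starting page and follow links one hop deep.
crawler = CodexCrawler(["https://example.com"], max_depth=1)

# crawl() returns {url: HTTP status} for pages fetched successfully and
# {url: [outgoing links]} as a simple link graph.
results, graph = crawler.crawl()

for url, status in results.items():
    print(status, url, "->", len(graph.get(url, [])), "links found")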