LordXido committed on
Commit
d6ddafd
·
verified ·
1 Parent(s): afaf2a8

Update crawler.py

Browse files
Files changed (1) hide show
  1. crawler.py +14 -22
crawler.py CHANGED
@@ -2,17 +2,12 @@ import requests
2
  from bs4 import BeautifulSoup
3
  from urllib.parse import urljoin
4
 
5
- def safe_fetch(url):
6
- try:
7
- return requests.get(url, timeout=5)
8
- except Exception:
9
- return None
10
-
11
  class CodexCrawler:
12
  def __init__(self, seed_urls, max_depth=2):
13
  self.frontier = [(u, 0) for u in seed_urls]
14
  self.seen = set()
15
  self.graph = {}
 
16
 
17
  def crawl(self):
18
  results = {}
@@ -21,21 +16,18 @@ class CodexCrawler:
21
  if url in self.seen or depth > self.max_depth:
22
  continue
23
 
24
- resp = safe_fetch(url)
25
- if not resp or resp.status_code != 200:
 
 
 
 
 
 
 
 
 
 
 
26
  continue
27
-
28
- self.seen.add(url)
29
- results[url] = resp.status_code
30
-
31
- soup = BeautifulSoup(resp.text, "html.parser")
32
- found = []
33
- for a in soup.find_all("a", href=True):
34
- link = urljoin(url, a["href"])
35
- if link not in self.seen:
36
- found.append(link)
37
- self.frontier.append((link, depth + 1))
38
-
39
- self.graph[url] = found
40
-
41
  return results, self.graph
 
2
  from bs4 import BeautifulSoup
3
  from urllib.parse import urljoin
4
 
 
 
 
 
 
 
5
  class CodexCrawler:
6
  def __init__(self, seed_urls, max_depth=2):
7
  self.frontier = [(u, 0) for u in seed_urls]
8
  self.seen = set()
9
  self.graph = {}
10
+ self.max_depth = max_depth
11
 
12
  def crawl(self):
13
  results = {}
 
16
  if url in self.seen or depth > self.max_depth:
17
  continue
18
 
19
+ try:
20
+ r = requests.get(url, timeout=5)
21
+ if r.status_code != 200:
22
+ continue
23
+ self.seen.add(url)
24
+ results[url] = r.status_code
25
+ soup = BeautifulSoup(r.text, 'html.parser')
26
+ links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
27
+ self.graph[url] = links
28
+ for link in links:
29
+ if link not in self.seen:
30
+ self.frontier.append((link, depth + 1))
31
+ except:
32
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  return results, self.graph