mibrahimzia committed
Commit 849a2f3 · verified · 1 Parent(s): 801be6e

Update crawler.py

Files changed (1):
  1. crawler.py +112 -1
crawler.py CHANGED
@@ -14,4 +14,115 @@ from backend.ai_interpreter import pattern_based_interpreter
  logger = logging.getLogger("webtapi.crawler")

  # Rest of your crawler.py code remains exactly the same...
- # [Keep all your existing functions and implementation]
+ # [Keep all your existing functions and implementation]
+
+
+ def get_random_user_agent():
+     """Return a random user agent to avoid detection"""
+     user_agents = [
+         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+         "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
+         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+         "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
+         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15"
+     ]
+     return random.choice(user_agents)
+
+ def get_domain(url):
+     """Extract domain from URL"""
+     parsed = urlparse(url)
+     return f"{parsed.scheme}://{parsed.netloc}"
+
+ def is_same_domain(url, base_url):
+     """Check if URL belongs to the same domain"""
+     return get_domain(url) == get_domain(base_url)
+
+ def get_links(html_content, base_url):
+     """Extract all links from HTML content"""
+     soup = BeautifulSoup(html_content, 'lxml')
+     links = set()
+
+     for a in soup.find_all('a', href=True):
+         href = a['href']
+         if href.startswith(('#', 'javascript:', 'mailto:', 'tel:')):
+             continue
+
+         # Resolve relative URLs
+         full_url = urljoin(base_url, href)
+
+         # Only include links from the same domain
+         if is_same_domain(full_url, base_url):
+             # Normalize URL by removing fragments and query parameters
+             parsed_url = urlparse(full_url)
+             normalized_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
+             if normalized_url:
+                 links.add(normalized_url)
+
+     return list(links)
+
+ def fetch_page(url):
+     """Fetch a page with error handling"""
+     try:
+         headers = {
+             "User-Agent": get_random_user_agent(),
+             "Accept-Language": "en-US,en;q=0.9",
+             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+             "Accept-Encoding": "gzip, deflate",
+             "Connection": "keep-alive",
+             "Upgrade-Insecure-Requests": "1"
+         }
+
+         response = requests.get(url, headers=headers, timeout=15)
+         response.raise_for_status()
+         return response.text, True
+     except Exception as e:
+         logger.error(f"Failed to fetch {url}: {str(e)}")
+         return None, False
+
+ def crawl_website(start_url, extraction_plan, max_pages=20, max_depth=3, delay=1):
+     """
+     Crawl a website using BFS algorithm
+     """
+     domain = get_domain(start_url)
+     visited = set()
+     queue = deque([(start_url, 0)])  # (url, depth)
+     results = []
+     page_count = 0
+
+     while queue and page_count < max_pages:
+         url, depth = queue.popleft()
+
+         if url in visited or depth > max_depth:
+             continue
+
+         visited.add(url)
+         logger.info(f"Crawling: {url} (depth: {depth})")
+
+         # Fetch the page
+         html_content, success = fetch_page(url)
+         if not success:
+             continue
+
+         # Extract data from the page (we'll process this later)
+         page_data = {
+             "url": url,
+             "depth": depth,
+             "html_content": html_content,
+             "status": "success"
+         }
+         results.append(page_data)
+         page_count += 1
+
+         # Get links from this page for further crawling
+         if depth < max_depth:
+             links = get_links(html_content, url)
+             for link in links:
+                 if link not in visited and link not in [u for u, d in queue]:
+                     queue.append((link, depth + 1))
+
+         # Respectful delay
+         time.sleep(delay)
+
+     return results
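
For reference, the added functions depend on imports that sit above the hunk; only the backend.ai_interpreter line is visible in the hunk header. A plausible import block, assuming the standard library plus requests and BeautifulSoup:

# Assumed imports for the added functions; not shown in this hunk.
import logging
import random
import time
from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

from backend.ai_interpreter import pattern_based_interpreter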
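
A minimal usage sketch, assuming crawler.py is importable from the working directory; note that extraction_plan is accepted but not consumed inside crawl_website in this hunk, so a placeholder works, and the target URL below is hypothetical:

# Hypothetical driver script; crawl_website comes from the diff above.
import logging
from crawler import crawl_website

logging.basicConfig(level=logging.INFO)

pages = crawl_website(
    "https://example.com",   # hypothetical start URL
    extraction_plan=None,    # placeholder; unused within crawl_website here
    max_pages=5,
    max_depth=2,
    delay=1,
)
for page in pages:
    print(page["url"], page["depth"], len(page["html_content"]))

Because the deque is consumed FIFO and every URL enters visited exactly once, pages come back in breadth-first order with no refetches, bounded by max_pages and max_depth.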