File size: 3,355 Bytes
d7b3d84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
"""
Example: Using large blocklists (400k+ domains) with automatic optimization

This example demonstrates:
1. Loading a real-world blocklist (HaGeZi's Pro++ with 439k+ domains)
2. Automatic conversion to set for O(1) lookup performance
3. Testing that blocked domains are actually blocked

Performance: ~0.02ms per domain check (50,000+ checks/second!)
"""

import asyncio
import os
import sys

# Add the directory three levels above this file to sys.path before the
# browser_use imports below run. Presumably this is the repository root, so the
# example uses the local checkout rather than an installed copy — TODO confirm.
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from dotenv import load_dotenv

# Load environment variables from a .env file before browser_use is imported
# (presumably so the LLM API key is available to ChatOpenAI — verify).
load_dotenv()

from browser_use import Agent, ChatOpenAI
from browser_use.browser import BrowserProfile, BrowserSession

# Module-level chat model handed to the Agent created in main().
llm = ChatOpenAI(model='gpt-4.1-mini')


def load_blocklist_from_url(url: str, timeout: float = 30.0) -> list[str]:
	"""Load and parse a blocklist from a URL.

	Every line of the response is decoded as UTF-8 and stripped of surrounding
	whitespace. Blank lines and lines starting with ``#`` are discarded; all
	remaining lines are returned in file order.

	Args:
		url: URL to the blocklist file
		timeout: Seconds passed to ``urlopen`` as its blocking-operation
			timeout, so a stalled server cannot hang the caller indefinitely.

	Returns:
		List of domain strings (comments and empty lines removed)

	Raises:
		urllib.error.URLError: If the URL cannot be opened.
		UnicodeDecodeError: If a line is not valid UTF-8.
	"""
	import urllib.request

	print(f'📥 Downloading blocklist from {url}...')

	with urllib.request.urlopen(url, timeout=timeout) as response:
		# 'utf-8-sig' decodes exactly like UTF-8 but drops a leading byte-order
		# mark; without that, a BOM in front of the first '#' header line would
		# hide the '#' and the header would be returned as a "domain".
		stripped_lines = (raw.decode('utf-8-sig').strip() for raw in response)
		# Skip comments and empty lines
		domains = [line for line in stripped_lines if line and not line.startswith('#')]

	print(f'✅ Loaded {len(domains):,} domains')
	return domains


async def main():
	"""Run the large-blocklist demo end to end.

	Downloads the blocklist, builds a BrowserSession whose BrowserProfile
	receives the list as ``prohibited_domains``, asks an Agent to visit one
	blocked and one allowed site, then waits for Enter and kills the browser
	session. The session is killed even if the agent run fails.

	Raises:
		RuntimeError: If the downloaded blocklist contains no domains.
	"""
	# Load HaGeZi's Pro++ blocklist (blocks ads, tracking, malware, etc.)
	# Source: https://github.com/hagezi/dns-blocklists
	blocklist_url = 'https://gitlab.com/hagezi/mirror/-/raw/main/dns-blocklists/domains/pro.plus.txt'

	print('=' * 70)
	print('🚀 Large Blocklist Demo - 439k+ Blocked Domains')
	print('=' * 70)
	print()

	# Load the blocklist
	prohibited_domains = load_blocklist_from_url(blocklist_url)
	if not prohibited_domains:
		# Nothing to demonstrate; fail with a clear message instead of an
		# IndexError from the sampling below.
		raise RuntimeError(f'Blocklist at {blocklist_url} contained no domains')

	# Sample some blocked domains to test: the first entry, the entry at index
	# 1000 and the last entry. The middle index is clamped so a short list does
	# not raise IndexError, and dict.fromkeys drops repeated indices while
	# keeping their order.
	last_index = len(prohibited_domains) - 1
	sample_indices = dict.fromkeys((0, min(1000, last_index), last_index))
	test_blocked = [prohibited_domains[i] for i in sample_indices]
	print(f'\n📋 Sample blocked domains: {", ".join(test_blocked)}')

	print(f'\n🔧 Creating browser with {len(prohibited_domains):,} blocked domains...')
	print('   (Auto-optimizing to set for O(1) lookup performance)')

	# Create browser with the blocklist
	# The list will be automatically optimized to a set for fast lookups
	browser_session = BrowserSession(
		browser_profile=BrowserProfile(
			prohibited_domains=prohibited_domains,
			headless=False,
			user_data_dir='~/.config/browseruse/profiles/blocklist-demo',
		),
	)

	# Task: Try to visit a blocked domain and a safe domain
	blocked_site = test_blocked[0]  # Will be blocked
	safe_site = 'github.com'  # Will be allowed

	task = f"""
	Try to navigate to these websites and report what happens:
	1. First, try to visit https://{blocked_site}
	2. Then, try to visit https://{safe_site}
	
	Tell me which sites you were able to access and which were blocked.
	"""

	agent = Agent(
		task=task,
		llm=llm,
		browser_session=browser_session,
	)

	print(f'\n🤖 Agent task: Try to visit {blocked_site} (blocked) and {safe_site} (allowed)')
	print('\n' + '=' * 70)

	try:
		await agent.run(max_steps=5)

		print('\n' + '=' * 70)
		print('✅ Demo complete!')
		print(f'💡 The blocklist with {len(prohibited_domains):,} domains was optimized to a set')
		print('   for instant O(1) domain checking (vs slow O(n) pattern matching)')
		print('=' * 70)

		# Keep the visible (headless=False) browser open until the user is done.
		input('\nPress Enter to close the browser...')
	finally:
		# Always tear the browser down, including when the agent run raises
		# or the user interrupts the prompt.
		await browser_session.kill()


# Script entry point: run the async demo only when executed directly, not on import.
if __name__ == '__main__':
	asyncio.run(main())