exorcist123 committed on
Commit
7a4585e
·
1 Parent(s): 438de50

add visa availability crawler

Browse files
.gitignore CHANGED
@@ -1,6 +1,6 @@
1
  **/__pycache__/
2
  **/*.pyc
3
-
4
  .env
5
  *.png
6
  *.log
 
1
  **/__pycache__/
2
  **/*.pyc
3
+ venv/
4
  .env
5
  *.png
6
  *.log
visa_availability_scraper_playwright.py ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ from typing import Dict, Optional, List
4
+ from playwright.async_api import async_playwright
5
+
6
class PassportIndexVisaScraper:
    """Look up visa requirements on passportindex.org through a Playwright page.

    Intended to be used as an async context manager: ``__aenter__`` launches
    Chromium and opens a page, ``__aexit__`` releases everything again.  Two
    lookup strategies are offered: posting to the site's visa-checker endpoint
    from inside the page (``check_visa_requirement_browser``) and clicking
    through the on-page country pickers (``check_visa_interactive``).
    """

    # Value sent as the ``cl`` form field when the page has no ``#cl`` input.
    FALLBACK_SESSION_TOKEN = "bc2140a2d83928ce1112d01e610bad89"

    def __init__(self, debug: bool = True):
        """
        Initialize the Passport Index visa scraper using Playwright

        Args:
            debug: Enable debug output
        """
        self.base_url = "https://www.passportindex.org/travel-visa-checker/"
        self.api_url = "https://www.passportindex.org/core/visachecker.php"
        self.debug = debug
        # All handles start as None so cleanup is safe on an instance that was
        # never (or only partially) started.
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None

    async def __aenter__(self):
        """Start Playwright, launch Chromium and open a page with stealth tweaks.

        Returns:
            This scraper instance.

        Raises:
            Whatever Playwright raises during start-up; anything already
            started is shut down before the exception propagates.
        """
        self.playwright = await async_playwright().start()

        # ``async with`` does not call __aexit__ when __aenter__ raises, so a
        # failure part-way through must be cleaned up here.
        try:
            # Launch browser with stealth settings
            self.browser = await self.playwright.chromium.launch(
                headless=True,  # Using headless mode
                args=[
                    '--disable-blink-features=AutomationControlled',
                    '--disable-dev-shm-usage',
                    '--no-sandbox',
                    '--disable-setuid-sandbox',
                    '--disable-web-security',
                    '--disable-features=IsolateOrigins,site-per-process'
                ]
            )

            # Create context with realistic settings
            self.context = await self.browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36',
                locale='en-US',
                timezone_id='America/New_York'
            )

            self.page = await self.context.new_page()

            # Add stealth JavaScript to avoid detection
            await self.page.add_init_script("""
                // Override the navigator.webdriver property
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                });

                // Override chrome property
                window.chrome = {
                    runtime: {}
                };

                // Override permissions
                const originalQuery = window.navigator.permissions.query;
                window.navigator.permissions.query = (parameters) => (
                    parameters.name === 'notifications' ?
                        Promise.resolve({ state: Notification.permission }) :
                        originalQuery(parameters)
                );
            """)
        except BaseException:
            await self._close_all()
            raise

        if self.debug:
            print("🚀 Browser initialized with stealth mode")

        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Clean up browser resources"""
        await self._close_all()

        if self.debug:
            print("🔒 Browser closed")

    async def _close_all(self) -> None:
        """Close page, context, browser and Playwright, innermost first.

        Each handle is released independently: a failure closing one is
        reported and does not prevent the remaining ones from being closed.
        """
        for attr, closer in (
            ("page", "close"),
            ("context", "close"),
            ("browser", "close"),
            ("playwright", "stop"),
        ):
            resource = getattr(self, attr, None)
            if resource is None:
                continue
            # Drop the reference first so a repeated cleanup is a no-op.
            setattr(self, attr, None)
            try:
                await getattr(resource, closer)()
            except Exception as e:
                print(f"⚠️ Error closing {attr}: {e}")

    async def _read_session_token(self) -> str:
        """Return the value of the page's ``#cl`` input, or the fallback token."""
        return await self.page.evaluate(
            """
            (fallback) => {
                const clInput = document.querySelector('#cl');
                return clInput ? clInput.value : fallback;
            }
            """,
            self.FALLBACK_SESSION_TOKEN,
        )

    @staticmethod
    def _selector_safe_code(country: str) -> str:
        """Lower-case a country code and reject anything but ASCII letters.

        The code is embedded in a CSS attribute selector, so quotes, brackets
        and similar characters must never get through.

        Raises:
            ValueError: if the code contains non-letter characters or is empty.
        """
        code = country.lower()
        if not (code.isascii() and code.isalpha()):
            raise ValueError(f"invalid country code: {country!r}")
        return code

    async def initialize_session(self) -> bool:
        """
        Navigate to the website and wait for it to load properly

        Returns:
            False when no browser page is open or an unexpected error occurs.
            True otherwise -- including when navigation itself fails, which is
            deliberately tolerated ("continuing anyway").
        """
        if self.page is None:
            print("❌ Error initializing session: browser page is not open")
            return False

        try:
            if self.debug:
                print("📱 Initializing session...")

            # Navigate to the page
            try:
                await self.page.goto(
                    self.base_url,
                    wait_until='domcontentloaded',
                    timeout=30000
                )
                await self.page.wait_for_timeout(3000)

                # Get the cl value from the page
                cl_value = await self._read_session_token()

                if self.debug:
                    print(f"✅ Page loaded, session ID: {cl_value}")

            except Exception as e:
                # Best effort: later lookups are still attempted.
                if self.debug:
                    print(f"⚠️ Page load issue: {e}, continuing anyway...")

            return True

        except Exception as e:
            print(f"❌ Error initializing session: {e}")
            return False

    async def check_visa_requirement_browser(self, passport_country: str, destination_country: str) -> Optional[Dict]:
        """
        Check visa requirements using browser automation

        The request is a form-encoded POST to ``self.api_url`` issued with
        ``fetch`` from inside the page, carrying fields ``d`` (destination),
        ``s`` (passport) and ``cl`` (session token).

        Args:
            passport_country: Two-letter country code for passport
            destination_country: Two-letter country code for destination

        Returns:
            Dictionary with visa information or None if failed (request error,
            empty response, or a response that is not a JSON object)
        """
        try:
            if self.debug:
                print(f"🌐 Checking {passport_country.upper()} → {destination_country.upper()}")

            # Get the current session ID from the page
            cl_value = await self._read_session_token()

            # Make the API request through the browser with proper argument passing
            result = await self.page.evaluate("""
                async (args) => {
                    const [apiUrl, passport, destination, sessionId] = args;
                    const formData = new URLSearchParams();
                    formData.append('d', destination);
                    formData.append('s', passport);
                    formData.append('cl', sessionId);

                    try {
                        const response = await fetch(apiUrl, {
                            method: 'POST',
                            headers: {
                                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                                'X-Requested-With': 'XMLHttpRequest',
                                'Accept': 'application/json, text/javascript, */*; q=0.01'
                            },
                            body: formData.toString(),
                            credentials: 'include'
                        });

                        if (!response.ok) {
                            throw new Error(`HTTP ${response.status}`);
                        }

                        const data = await response.json();
                        return data;
                    } catch (error) {
                        return { error: error.message };
                    }
                }
            """, [self.api_url, passport_country.lower(), destination_country.lower(), cl_value])

            # Anything that is not a non-empty JSON object counts as a failure
            # so callers can rely on getting a dict.
            if not result or not isinstance(result, dict):
                return None

            if 'error' in result:
                print(f"❌ API Error: {result['error']}")
                return None

            if self.debug:
                print(f"✅ Got result: {result}")
            return result

        except Exception as e:
            print(f"❌ Error checking visa requirement: {e}")
            return None

    async def check_visa_interactive(self, passport_country: str, destination_country: str) -> Optional[Dict]:
        """
        Alternative method: Use the interactive UI to check visa requirements

        Args:
            passport_country: Two-letter country code for passport
            destination_country: Two-letter country code for destination

        Returns:
            Dictionary with keys ``text``, ``days``, ``pass`` (lower-case code)
            and ``dest`` (upper-case code) read from the ``.vch-result``
            element, or None if that element is missing, a country code is
            invalid, or any UI step fails.
        """
        try:
            # Validate before touching the page: the codes end up inside CSS
            # selectors below.
            passport_code = self._selector_safe_code(passport_country)
            destination_code = self._selector_safe_code(destination_country)

            if self.debug:
                print(f"🖱️ Using interactive method for {passport_code.upper()} → {destination_code.upper()}")

            # Click on the passport selector
            await self.page.click('.vch-select-pass')
            await self.page.wait_for_timeout(500)

            # Find and click the country in the list
            passport_selector = f'.vch-passports .s-div[data-ccode="{passport_code}"]'
            await self.page.wait_for_selector(passport_selector, timeout=5000)
            await self.page.click(passport_selector)
            await self.page.wait_for_timeout(500)

            # Click on the destination selector
            await self.page.click('.vch-select-des')
            await self.page.wait_for_timeout(500)

            # Find and click the destination country
            dest_selector = f'.vch-destinations .s-div[data-ccode="{destination_code}"]'
            await self.page.wait_for_selector(dest_selector, timeout=5000)
            await self.page.click(dest_selector)
            await self.page.wait_for_timeout(1000)

            # Get the result from the page; the codes are passed as arguments
            # rather than spliced into the script text.
            result = await self.page.evaluate("""
                (args) => {
                    const [passport, destination] = args;
                    const resultElement = document.querySelector('.vch-result');
                    if (resultElement) {
                        const text = resultElement.querySelector('.text');
                        const days = resultElement.querySelector('.days');
                        return {
                            text: text ? text.textContent : '',
                            days: days ? days.textContent : '',
                            pass: passport,
                            dest: destination
                        };
                    }
                    return null;
                }
            """, [passport_code, destination_code.upper()])

            return result

        except Exception as e:
            if self.debug:
                print(f"❌ Interactive method failed: {e}")
            return None

    async def check_multiple_destinations(self, passport_country: str, destinations: List[str], delay: float = 2.0) -> Dict:
        """
        Check visa requirements for multiple destinations

        Each destination is tried through the API method first and through the
        interactive UI when that yields nothing.

        Args:
            passport_country: Two-letter country code for passport
            destinations: List of two-letter country codes for destinations
            delay: Delay between requests in seconds

        Returns:
            Dictionary mapping destination codes to visa information
            (None for destinations where both methods failed)
        """
        results = {}
        total = len(destinations)

        for i, dest in enumerate(destinations, 1):
            print(f"\n[{i}/{total}] Checking {passport_country.upper()} → {dest.upper()}...")

            # Try API method first
            result = await self.check_visa_requirement_browser(passport_country, dest)

            # If API fails, try interactive method
            if not result:
                result = await self.check_visa_interactive(passport_country, dest)

            if result:
                results[dest] = result
                text = result.get('text', 'No text available')
                print(f" ✅ Result: {text}")
            else:
                results[dest] = None
                print(" ❌ Failed to get result")

            # Rate limiting (skipped after the last destination)
            if i < total:
                print(f" ⏳ Waiting {delay} seconds...")
                await asyncio.sleep(delay)

        return results

    def format_result(self, result: Optional[Dict]) -> str:
        """Format a single result for display

        Args:
            result: Lookup result with optional ``text``, ``dest`` and ``pass``
                entries, or None / empty.

        Returns:
            ``"<PASS> → <DEST>: <text>"``, with ``N/A`` for missing or null
            fields, or ``"No information available"`` for an empty result.
        """
        if not result:
            return "No information available"

        def field(key: str) -> str:
            # Responses may carry null or non-string values; never call
            # str methods on them directly.
            value = result.get(key)
            return 'N/A' if value is None else str(value)

        return f"{field('pass').upper()} → {field('dest').upper()}: {field('text')}"
299
+
300
+
301
async def main():
    """Demonstrate the scraper: one single lookup, then a batch of destinations.

    Runs a US → GB check (API method first, interactive UI as fallback) and
    then checks a fixed list of destinations for the same passport, printing
    a summary at the end.
    """
    # Single source of truth for the demo inputs, so the printed labels can
    # never drift from what is actually queried.
    passport = 'us'
    single_destination = 'gb'
    destinations = ['ca', 'mx', 'jp', 'au']  # Canada, Mexico, Japan, Australia

    print("="*60)
    print(" Passport Index Visa Checker (Playwright)")
    print("="*60)

    async with PassportIndexVisaScraper(debug=True) as scraper:
        # Initialize session
        if not await scraper.initialize_session():
            print("❌ Failed to initialize session")
            return

        print("\n" + "="*60)
        print(" Testing visa requirements...")
        print("="*60)

        # Test single visa requirement
        print(f"\n📍 Single visa check: {passport.upper()} → {single_destination.upper()}")
        print("-" * 40)
        result = await scraper.check_visa_requirement_browser(passport, single_destination)
        if not result:
            print("Trying interactive method...")
            result = await scraper.check_visa_interactive(passport, single_destination)

        if result:
            print(f"Result: {scraper.format_result(result)}")
        else:
            # Report the failure instead of silently moving on.
            print(f" ❌ {passport.upper()} → {single_destination.upper()}: Failed")

        # Test multiple destinations
        print(f"\n📍 Multiple destinations for {passport.upper()} passport:")
        print("-" * 40)
        results = await scraper.check_multiple_destinations(passport, destinations, delay=2.0)

        print("\n📊 Summary:")
        for dest, result in results.items():
            if result:
                print(f" ✅ {scraper.format_result(result)}")
            else:
                print(f" ❌ {passport.upper()} → {dest.upper()}: Failed")
341
+
342
+
343
# Reference table (not exhaustive): lower-case two-letter country code -> English name.
COUNTRY_CODES = {
    "af": "Afghanistan",
    "al": "Albania",
    "dz": "Algeria",
    "ad": "Andorra",
    "ao": "Angola",
    "ag": "Antigua and Barbuda",
    "ar": "Argentina",
    "am": "Armenia",
    "au": "Australia",
    "at": "Austria",
    "az": "Azerbaijan",
    "bs": "Bahamas",
    "bh": "Bahrain",
    "bd": "Bangladesh",
    "bb": "Barbados",
    "by": "Belarus",
    "be": "Belgium",
    "bz": "Belize",
    "bj": "Benin",
    "bt": "Bhutan",
    "bo": "Bolivia",
    "ba": "Bosnia and Herzegovina",
    "bw": "Botswana",
    "br": "Brazil",
    "bn": "Brunei",
    "bg": "Bulgaria",
    "bf": "Burkina Faso",
    "bi": "Burundi",
    "kh": "Cambodia",
    "cm": "Cameroon",
    "ca": "Canada",
    "cv": "Cape Verde",
    "cf": "Central African Republic",
    "td": "Chad",
    "cl": "Chile",
    "cn": "China",
    "co": "Colombia",
    "km": "Comoros",
    "cg": "Congo",
    "cr": "Costa Rica",
    "hr": "Croatia",
    "cu": "Cuba",
    "cy": "Cyprus",
    "cz": "Czech Republic",
    "dk": "Denmark",
    "dj": "Djibouti",
    "dm": "Dominica",
    "do": "Dominican Republic",
    "ec": "Ecuador",
    "eg": "Egypt",
    "sv": "El Salvador",
    "gq": "Equatorial Guinea",
    "er": "Eritrea",
    "ee": "Estonia",
    "et": "Ethiopia",
    "fj": "Fiji",
    "fi": "Finland",
    "fr": "France",
    "ga": "Gabon",
    "gm": "Gambia",
    "ge": "Georgia",
    "de": "Germany",
    "gh": "Ghana",
    "gr": "Greece",
    "gd": "Grenada",
    "gt": "Guatemala",
    "gn": "Guinea",
    "gw": "Guinea-Bissau",
    "gy": "Guyana",
    "ht": "Haiti",
    "hn": "Honduras",
    "hu": "Hungary",
    "is": "Iceland",
    "in": "India",
    "id": "Indonesia",
    "ir": "Iran",
    "iq": "Iraq",
    "ie": "Ireland",
    "il": "Israel",
    "it": "Italy",
    "jm": "Jamaica",
    "jp": "Japan",
    "jo": "Jordan",
    "kz": "Kazakhstan",
    "ke": "Kenya",
    "ki": "Kiribati",
    "kp": "North Korea",
    "kr": "South Korea",
    "kw": "Kuwait",
    "kg": "Kyrgyzstan",
    "la": "Laos",
    "lv": "Latvia",
    "lb": "Lebanon",
    "ls": "Lesotho",
    "lr": "Liberia",
    "ly": "Libya",
    "li": "Liechtenstein",
    "lt": "Lithuania",
    "lu": "Luxembourg",
    "mk": "Macedonia",
    "mg": "Madagascar",
    "mw": "Malawi",
    "my": "Malaysia",
    "mv": "Maldives",
    "ml": "Mali",
    "mt": "Malta",
    "mh": "Marshall Islands",
    "mr": "Mauritania",
    "mu": "Mauritius",
    "mx": "Mexico",
    "fm": "Micronesia",
    "md": "Moldova",
    "mc": "Monaco",
    "mn": "Mongolia",
    "me": "Montenegro",
    "ma": "Morocco",
    "mz": "Mozambique",
    "mm": "Myanmar",
    "na": "Namibia",
    "nr": "Nauru",
    "np": "Nepal",
    "nl": "Netherlands",
    "nz": "New Zealand",
    "ni": "Nicaragua",
    "ne": "Niger",
    "ng": "Nigeria",
    "no": "Norway",
    "om": "Oman",
    "pk": "Pakistan",
    "pw": "Palau",
    "pa": "Panama",
    "pg": "Papua New Guinea",
    "py": "Paraguay",
    "pe": "Peru",
    "ph": "Philippines",
    "pl": "Poland",
    "pt": "Portugal",
    "qa": "Qatar",
    "ro": "Romania",
    "ru": "Russia",
    "rw": "Rwanda",
    "kn": "Saint Kitts and Nevis",
    "lc": "Saint Lucia",
    "vc": "Saint Vincent and the Grenadines",
    "ws": "Samoa",
    "sm": "San Marino",
    "st": "Sao Tome and Principe",
    "sa": "Saudi Arabia",
    "sn": "Senegal",
    "rs": "Serbia",
    "sc": "Seychelles",
    "sl": "Sierra Leone",
    "sg": "Singapore",
    "sk": "Slovakia",
    "si": "Slovenia",
    "sb": "Solomon Islands",
    "so": "Somalia",
    "za": "South Africa",
    "es": "Spain",
    "lk": "Sri Lanka",
    "sd": "Sudan",
    "sr": "Suriname",
    "sz": "Swaziland",
    "se": "Sweden",
    "ch": "Switzerland",
    "sy": "Syria",
    "tw": "Taiwan",
    "tj": "Tajikistan",
    "tz": "Tanzania",
    "th": "Thailand",
    "tl": "Timor-Leste",
    "tg": "Togo",
    "to": "Tonga",
    "tt": "Trinidad and Tobago",
    "tn": "Tunisia",
    "tr": "Turkey",
    "tm": "Turkmenistan",
    "tv": "Tuvalu",
    "ug": "Uganda",
    "ua": "Ukraine",
    "ae": "United Arab Emirates",
    "gb": "United Kingdom",
    "us": "United States",
    "uy": "Uruguay",
    "uz": "Uzbekistan",
    "vu": "Vanuatu",
    "ve": "Venezuela",
    "vn": "Vietnam",
    "ye": "Yemen",
    "zm": "Zambia",
    "zw": "Zimbabwe",
}

# Script entry point: run the demo only when executed directly.
if __name__ == "__main__":
    asyncio.run(main())