# -*- coding: utf-8 -*-
"""
Created on Tue Jul 25 11:41:43 2023

This Python Script to Crawl PDF links from ADB

@author: intern.giwon.kim
"""
import os
import random
import re
import sys
import urllib
import urllib.request
from urllib import request

# ------------------Proxy setting-------------------#
# Default value is 0. You can set to 1 if you have proxy and fill the
# tunnel, username and password below.
usingProxy = 0
tunnel = 'tunnel.qg.net:45129'
username = "992272B6"
password = "E6E199CC6B76"


def header_define(usingProxy):
    """Build a urllib opener carrying a random User-Agent header.

    Multiple headers could be added in case of retrieval failure.

    :param usingProxy: truthy to route traffic through the authenticated
        proxy configured at module level (tunnel / username / password).
    :return: a ``urllib.request.OpenerDirector`` ready for ``opener.open``.
    """
    my_headers = [
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
    ]
    if usingProxy:
        proxy_support = urllib.request.ProxyHandler({
            'https': 'http://%(user)s:%(pwd)s@%(proxy)s'
                     % {"user": username, "pwd": password, "proxy": tunnel}})
        opener = urllib.request.build_opener(proxy_support)
    else:
        opener = urllib.request.build_opener()
    # addheaders must be a list of (name, value) TUPLES. The original used a
    # set literal {'User-Agent', ...}, which urllib cannot serialize into a
    # request header.
    opener.addheaders = [('User-Agent', random.choice(my_headers))]
    return opener


def launch_webdriver(web_driver="Edge"):
    """Start a Selenium WebDriver session.

    Choose your browse. For AIIB users, the microsoft Edge is recommended.

    :param web_driver: browser name; only "Edge" is currently handled.
    :return: a selenium webdriver instance, or None if startup failed and
        the user did not retry; may call ``sys.exit()`` if the user quits.
    """
    # Imported here so the module can be loaded without selenium installed.
    from selenium import webdriver

    if web_driver == "Edge":
        try:
            return webdriver.Edge(r"../lib/msedgedriver.exe")
        except Exception as e:
            print(e)
            # Must test the message TEXT: `"version" in e` on the exception
            # object itself raises TypeError.
            if "version" in str(e):
                print("Please download the latest version from https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ . Choose The 'stable channle', 'X64' version.")
                print("If you have download and save it to './lib' folder, input 'y', if you want to quit, input 'q':")
                # Read the answer ONCE: the original called input() twice,
                # forcing the user to answer two separate prompts, and it
                # also discarded the driver returned by the retry.
                answer = input().strip()
                if answer == "y":
                    return launch_webdriver(web_driver)
                if answer == "q":
                    sys.exit()
            else:
                print("Please make sure there is a ./lib folder under current space. And check if there is a msedgedriver.exe file.\
If not, please download from: https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ . Choose The 'stable channle', 'X64' version.")
    # No driver could be started (unsupported browser or user declined retry).
    return None


def main():
    """Connect to the website, collect all PDF hrefs, print absolute URLs."""
    # bs4 is only needed at crawl time; keeping the import local leaves the
    # module importable without third-party packages.
    from bs4 import BeautifulSoup

    header = header_define(0)  # NOTE(review): opener is built but never used below — confirm intent
    driver = launch_webdriver()

    # NOTE(review): no driver.get(<url>) is ever issued, so page_source is
    # whatever page the browser starts on — confirm the intended target URL.
    soup = BeautifulSoup(driver.page_source, "html.parser")
    # Escape the dot: the original r'(.pdf)' matched ANY character before
    # "pdf" (e.g. "xpdf"), not just a literal ".pdf" extension.
    links = soup.find_all('a', href=re.compile(r'\.pdf'))

    # clean the pdf link names
    url_list = [
        "https://www.adb.org/sites/default/files/project-documents/" + el['href']
        for el in links
    ]
    print(url_list)


if __name__ == "__main__":
    main()