"""
Created on Tue Jul 25 11:41:43 2023

This Python script crawls PDF links from the ADB (Asian Development Bank) website.

@author: intern.giwon.kim
"""
import os
import random
import re
import sys
import urllib
from urllib import request

from bs4 import BeautifulSoup
# NOTE(review): 'webdriver' is used by launch_webdriver() but never imported —
# this file also needs `from selenium import webdriver` to run.
|
|
| |
| |
| usingProxy = 0 |
| tunnel = 'tunnel.qg.net:45129' |
| username = "992272B6" |
| password = "E6E199CC6B76" |
|
|
| |
| def header_define(usingProxy): |
| """ |
| Multiple headers could be added in case of retrieval failure. |
| :return: an opener for machine to open a web. |
| It is an inner build type. |
| """ |
| my_headers = [ |
| "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36", |
| "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36", |
| "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36", |
| "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14", |
| "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)", |
| 'Opera/9.25 (Windows NT 5.1; U; en)', |
| 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)', |
| 'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)', |
| 'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9', |
| "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7"] |
|
|
| if usingProxy: |
| proxy_support = urllib.request.ProxyHandler({'https': 'http://%(user)s:%(pwd)s@%(proxy)s' % {"user": username, |
| "pwd": password, |
| "proxy": tunnel}}) |
| opener = urllib.request.build_opener(proxy_support) |
| opener.addheaders = [{'User-Agent', random.choice(my_headers)}] |
| else: |
| opener = urllib.request.build_opener() |
| opener.addheaders = [{'User-Agent', random.choice(my_headers)}] |
|
|
| return opener |
|
|
| def launch_webdriver(web_driver = "Edge"): |
| |
| if web_driver == "Edge": |
| try: |
| driver = webdriver.Edge(r"../lib/msedgedriver.exe") |
| except Exception as e: |
| print(e) |
| if "version" in e: |
| print("Please download the latest version from https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ . Choose The 'stable channle', 'X64' version.") |
| print("If you have download and save it to './lib' folder, input 'y', if you want to quit, input 'q':") |
| if input().strip() == "y": |
| launch_webdriver() |
| if input().strip() == "q": |
| sys.exit() |
| else: |
| print("Please make sure there is a ./lib folder under current space. And check if there is a msedgedriver.exe file.\ If not, please download from: https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ . Choose The 'stable channle', 'X64' version.") |
| return driver |
|
|
| header = header_define(0) |
| driver = launch_webdriver() |
|
|
| |
|
|
| soup= BeautifulSoup(driver.page_source, "html.parser") |
| links = soup.find_all('a', href=re.compile(r'(.pdf)')) |
|
|
|
|
| |
| url_list = [] |
| for el in links: |
| url_list.append(("https://www.adb.org/sites/default/files/project-documents/" + el['href'])) |
| print(url_list) |
|
|
|
|