# -*- coding: utf-8 -*-
"""
Created on Tue Jul 25 11:41:43 2023

This Python Script to Crawl PDF links from ADB

@author: intern.giwon.kim
"""
import os
import random
import re
import sys
import urllib
import urllib.request
from urllib import request

# ------------------Proxy setting-------------------#
# Default value is 0. You can set to 1 if you have proxy and fill the
# tunnel, username and password below.
usingProxy = 0
tunnel = 'tunnel.qg.net:45129'
username = "992272B6"
password = "E6E199CC6B76"


def header_define(usingProxy):
    """Build a urllib opener carrying a random User-Agent header.

    Multiple headers could be added in case of retrieval failure.

    :param usingProxy: truthy to route traffic through the authenticated
        proxy configured at module level (tunnel / username / password).
    :return: a ``urllib.request.OpenerDirector`` ready for ``opener.open``.
    """
    my_headers = [
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
    ]
    if usingProxy:
        proxy_support = urllib.request.ProxyHandler({
            'https': 'http://%(user)s:%(pwd)s@%(proxy)s'
                     % {"user": username, "pwd": password, "proxy": tunnel}})
        opener = urllib.request.build_opener(proxy_support)
    else:
        opener = urllib.request.build_opener()
    # addheaders must be a list of (name, value) TUPLES. The original used a
    # set literal {'User-Agent', ...}, which urllib cannot serialize into a
    # request header.
    opener.addheaders = [('User-Agent', random.choice(my_headers))]
    return opener


def launch_webdriver(web_driver="Edge"):
    """Start a Selenium WebDriver session.

    Choose your browse. For AIIB users, the microsoft Edge is recommended.

    :param web_driver: browser name; only "Edge" is currently handled.
    :return: a selenium webdriver instance, or None if startup failed and
        the user did not retry; may call ``sys.exit()`` if the user quits.
    """
    # Imported here so the module can be loaded without selenium installed.
    from selenium import webdriver

    if web_driver == "Edge":
        try:
            return webdriver.Edge(r"../lib/msedgedriver.exe")
        except Exception as e:
            print(e)
            # Must test the message TEXT: `"version" in e` on the exception
            # object itself raises TypeError.
            if "version" in str(e):
                print("Please download the latest version from https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ . Choose The 'stable channle', 'X64' version.")
                print("If you have download and save it to './lib' folder, input 'y', if you want to quit, input 'q':")
                # Read the answer ONCE: the original called input() twice,
                # forcing the user to answer two separate prompts, and it
                # also discarded the driver returned by the retry.
                answer = input().strip()
                if answer == "y":
                    return launch_webdriver(web_driver)
                if answer == "q":
                    sys.exit()
            else:
                print("Please make sure there is a ./lib folder under current space. And check if there is a msedgedriver.exe file.\
If not, please download from: https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ . Choose The 'stable channle', 'X64' version.")
    # No driver could be started (unsupported browser or user declined retry).
    return None


def main():
    """Connect to the website, collect all PDF hrefs, print absolute URLs."""
    # bs4 is only needed at crawl time; keeping the import local leaves the
    # module importable without third-party packages.
    from bs4 import BeautifulSoup

    header = header_define(0)  # NOTE(review): opener is built but never used below — confirm intent
    driver = launch_webdriver()

    # NOTE(review): no driver.get(<url>) is ever issued, so page_source is
    # whatever page the browser starts on — confirm the intended target URL.
    soup = BeautifulSoup(driver.page_source, "html.parser")
    # Escape the dot: the original r'(.pdf)' matched ANY character before
    # "pdf" (e.g. "xpdf"), not just a literal ".pdf" extension.
    links = soup.find_all('a', href=re.compile(r'\.pdf'))

    # clean the pdf link names
    url_list = [
        "https://www.adb.org/sites/default/files/project-documents/" + el['href']
        for el in links
    ]
    print(url_list)


if __name__ == "__main__":
    main()