"""
Created on Tue Jul 25 11:41:43 2023

This Python script crawls PDF links from the ADB (Asian Development Bank) website.

@author: intern.giwon.kim
"""
import os
import random
import re
import sys
import urllib
from urllib import request

from bs4 import BeautifulSoup
# NOTE(review): 'webdriver' is used by launch_webdriver() but never imported —
# this file also needs `from selenium import webdriver` to run.
|
|
| |
| |
| usingProxy = 0 |
| tunnel = 'tunnel.qg.net:45129' |
| username = "992272B6" |
| password = "E6E199CC6B76" |
|
|
| |
| def header_define(usingProxy): |
| """ |
| Multiple headers could be added in case of retrieval failure. |
| :return: an opener for machine to open a web. |
| It is an inner build type. |
| """ |
| my_headers = [ |
| "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36", |
| "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36", |
| "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36", |
| "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14", |
| "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)", |
| 'Opera/9.25 (Windows NT 5.1; U; en)', |
| 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)', |
| 'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)', |
| 'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9', |
| "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7"] |
|
|
| if usingProxy: |
| proxy_support = urllib.request.ProxyHandler({'https': 'http://%(user)s:%(pwd)s@%(proxy)s' % {"user": username, |
| "pwd": password, |
| "proxy": tunnel}}) |
| opener = urllib.request.build_opener(proxy_support) |
| opener.addheaders = [{'User-Agent', random.choice(my_headers)}] |
| else: |
| opener = urllib.request.build_opener() |
| opener.addheaders = [{'User-Agent', random.choice(my_headers)}] |
|
|
| return opener |
|
|
| def launch_webdriver(web_driver = "Edge"): |
| |
| if web_driver == "Edge": |
| try: |
| driver = webdriver.Edge(r"../lib/msedgedriver.exe") |
| except Exception as e: |
| print(e) |
| if "version" in e: |
| print("Please download the latest version from https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ . Choose The 'stable channle', 'X64' version.") |
| print("If you have download and save it to './lib' folder, input 'y', if you want to quit, input 'q':") |
| if input().strip() == "y": |
| launch_webdriver() |
| if input().strip() == "q": |
| sys.exit() |
| else: |
| print("Please make sure there is a ./lib folder under current space. And check if there is a msedgedriver.exe file.\ If not, please download from: https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ . Choose The 'stable channle', 'X64' version.") |
| return driver |
|
|
| header = header_define(0) |
| driver = launch_webdriver() |
|
|
| |
|
|
| soup= BeautifulSoup(driver.page_source, "html.parser") |
| links = soup.find_all('a', href=re.compile(r'(.pdf)')) |
|
|
|
|
| |
| url_list = [] |
| for el in links: |
| url_list.append(("https://www.adb.org/sites/default/files/project-documents/" + el['href'])) |
| print(url_list) |
|
|
|
|