Spaces:

cools
/

Gideon

Runtime error

Gideon / Scraper.py

Update Scraper.py

4ec14ff over 2 years ago

1.6 kB

	from bs4 import BeautifulSoup
	import re
	import requests
	import os

	def download_slip(link):
	    """Download a slip-opinion PDF to ``PDF Cases/Temp/opinion.pdf``.

	    Args:
	        link: Site-relative path of the PDF; it is appended to
	            ``https://www.supremecourt.gov`` to form the request URL.

	    Every call writes to the same ``Temp`` folder, so a later download
	    overwrites the file saved by an earlier one.
	    """
	    folder = 'PDF Cases/' + "Temp"
	    target = folder + '/' + "opinion.pdf"
	    # The context manager closes the streamed response even if writing fails.
	    # NOTE(review): the HTTP status is not checked, so an error page would be
	    # saved as opinion.pdf.
	    with requests.get("https://www.supremecourt.gov" + link, stream=True) as r:
	        # makedirs (rather than mkdir) also creates a missing 'PDF Cases'
	        # parent and tolerates the folder already existing.
	        os.makedirs(folder, exist_ok=True)
	        with open(target, 'wb') as f:
	            for chunk in r.iter_content(chunk_size=1024):
	                f.write(chunk)

	def download_loc(link):
	    """Download the PDF at ``link`` to ``PDF Cases/Temp/opinion.pdf``.

	    Prints "making dir" when the target folder has to be created and, once
	    the file is written, prints the folder's directory listing.

	    Args:
	        link: Absolute URL of the PDF. In its file name, the text after
	            ``usrep`` is parsed as a volume (first three characters) and a
	            page (the remainder).

	    Raises:
	        ValueError: If the volume or page part of the file name cannot be
	            parsed as an integer.
	    """
	    stem = link.split('/')[-1].split('.pdf')[0]
	    digits = stem.split('usrep')[-1]
	    # The volume/page values are not used for the folder name (everything is
	    # written to "Temp"), but parsing them rejects unexpected file names with
	    # ValueError before any request is made.
	    # NOTE(review): confirm this rejection is intended.
	    _volume, _page = int(digits[0:3]), int(digits[3:])

	    folder = 'PDF Cases/' + "Temp"
	    # The context manager closes the streamed response even if writing fails.
	    with requests.get(link, stream=True) as r:
	        if not os.path.isdir(folder):
	            print("making dir")
	            os.makedirs(folder, exist_ok=True)
	        with open(folder + '/' + "opinion.pdf", 'wb') as f:
	            for chunk in r.iter_content(chunk_size=1024):
	                f.write(chunk)
	    print(os.listdir(folder))


	def slip_pipeline(year):
	    """Download the slip-opinion PDFs linked from the index page for ``year``.

	    Fetches ``https://www.supremecourt.gov/opinions/slipopinion/<year>``,
	    collects the PDF links inside the first ``div`` with id ``accordion`` and
	    passes each one to ``download_slip``.

	    Args:
	        year: Value appended (via ``str``) to the index URL.

	    Raises:
	        IndexError: If the page contains no ``div`` with id ``accordion``.
	    """
	    page = requests.get("https://www.supremecourt.gov/opinions/slipopinion/" + str(year))
	    # Name the parser explicitly so the parse does not depend on which
	    # optional parsers are installed (and bs4 does not warn about guessing).
	    soup = BeautifulSoup(page.text, "html.parser")
	    anchors = soup.find_all('div', attrs={'id': 'accordion'})[0].find_all('a')

	    links = []
	    for anchor in anchors:
	        href = anchor.get('href')
	        if href is None:
	            # An anchor without an href has nothing to download; skip it
	            # rather than failing on None.
	            continue
	        # Keep PDF links (extension matched case-insensitively) whose href
	        # contains neither "new" nor "diff" (matched case-sensitively).
	        if ".pdf" in href.lower() and "new" not in href and "diff" not in href:
	            links.append(href)

	    for pdf_link in links:
	        download_slip(pdf_link)