"""Download U.S. Supreme Court opinion PDFs into per-case folders.

Two sources are supported: slip opinions scraped from supremecourt.gov
(`slip_pipeline` / `download_slip`) and bound-volume scans addressed by
'usrepVVVPPP...' style URLs (`download_loc`). Files land under
'PDF Cases/<case folder>/opinion.pdf'.
"""

import os
import re

import requests
from bs4 import BeautifulSoup


def download_slip(link):
    """Download one slip-opinion PDF from supremecourt.gov.

    Parameters
    ----------
    link : str
        Site-relative PDF path (e.g. '/opinions/22pdf/21-1333.pdf').

    Side effects: creates 'PDF Cases/<basename>/' if needed and streams
    the PDF into '<that folder>/opinion.pdf' in 1 KiB chunks.
    """
    r = requests.get("https://www.supremecourt.gov" + link, stream=True)
    # Folder name = the PDF's basename without its '.pdf' extension.
    base = link.split('/')[-1].split('.pdf')[0]
    # BUG FIX: a leftover debug override (base = "Temp") previously clobbered
    # the computed name, so every opinion overwrote the same folder.
    folder = os.path.join('PDF Cases', base)
    # exist_ok=True for consistency with download_loc (and no TOCTOU race).
    os.makedirs(folder, exist_ok=True)
    name = os.path.join(folder, "opinion.pdf")
    with open(name, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            f.write(chunk)


def download_loc(link):
    """Download one bound-volume opinion scan.

    Parameters
    ----------
    link : str
        Absolute URL whose basename looks like 'usrepVVVPPP....pdf':
        VVV is the U.S. Reports volume, the remaining digits the first page.

    Side effects: creates 'PDF Cases/<volume>_<page>/' if needed, streams
    the PDF into '<that folder>/opinion.pdf', and prints the folder listing.
    """
    base = link.split('/')[-1].split('.pdf')[0]
    # 'usrepVVVPPP...' -> volume is the first three digits after 'usrep',
    # page is everything after them.
    volume = int(base.split('usrep')[-1][0:3])
    page = int(base.split('usrep')[-1][3:])
    foldername = str(volume) + '_' + str(page)
    # BUG FIX: removed the debug override (foldername = "Temp") that made
    # every volume/page pair collapse into one shared folder.
    r = requests.get(link, stream=True)
    target = os.path.join('PDF Cases', foldername)
    if not os.path.isdir(target):
        print("making dir")
        os.makedirs(target, exist_ok=True)
    name = os.path.join(target, "opinion.pdf")
    with open(name, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            f.write(chunk)
    print(os.listdir(target))


def slip_pipeline(year):
    """Scrape the slip-opinion index for *year* and download every opinion.

    Parameters
    ----------
    year : int or str
        Term year segment of the URL, e.g. 22 for
        https://www.supremecourt.gov/opinions/slipopinion/22

    Side effects: one network fetch for the index page plus one
    `download_slip` call (fetch + file write) per qualifying link.
    """
    page = requests.get(
        "https://www.supremecourt.gov/opinions/slipopinion/" + str(year))
    # Explicit parser: the no-argument form warns and can vary by what
    # parsers happen to be installed.
    soup = BeautifulSoup(page.text, 'html.parser')
    html_links = soup.findAll('div', attrs={'id': 'accordion'})[0].findAll('a')
    links = []
    for link in html_links:
        href = link.get('href')
        # Keep only final opinion PDFs; 'new'/'diff' mark revision artifacts.
        if ".pdf" in href.lower() and "new" not in href and "diff" not in href:
            links.append(href)
    for l in links:
        download_slip(l)