| import streamlit as st |
| import cv2 |
| import Scraper |
| import Manager |
| import ImageProcessor |
| import TextProcessor |
| import Parser |
| import os |
| import pickle |
| from bs4 import BeautifulSoup |
| import requests |
| import shutil |
| import pandas as pd |
|
|
|
|
st.set_page_config(layout="wide")

# Pick a U.S. Reports volume and scrape the Library of Congress list page for
# per-case links.  Each case appears as a <link rel="alternate"> pointing at a
# .gif preview; swapping the extension for .pdf yields the case PDF.
volume = st.slider('Volume', 200, 550, 400)
r = requests.get(
    'https://www.loc.gov/collections/united-states-reports/'
    '?fa=partof%3Au.s.+reports%3A+volume+' + str(volume) + '&st=list&c=250'
)
# Explicit parser: BeautifulSoup without one warns and may pick a different
# parser (with different output) depending on what is installed locally.
soup = BeautifulSoup(r.text, 'html.parser')
html_links = soup.findAll('link', attrs={'rel': 'alternate'})
pdf_links = []
for hl in html_links:
    # NOTE(review): the 11-character stem check presumably selects only the
    # case-image links among the alternates — verify against the LoC markup.
    if len(hl['href'].split('/')[-1].split('.gif')[0]) == 11:
        pdf_links.append(hl['href'].split('.gif')[0] + '.pdf')

# BUG FIX: the slider max used to be len(pdf_links) — one past the last valid
# index — so dragging it fully right crashed the "Run" step with IndexError.
# The default is clamped too, so a 0/1-case volume still renders the slider.
max_case = max(len(pdf_links) - 1, 0)
case_num = st.slider('Case Number', 0, max_case, min(1, max_case), step=1)
|
|
# Download the selected case from the Library of Congress and run the
# image + text processing pipeline over the freshly downloaded pages.
run = st.button("Run")
if run:
    with st.spinner("Downloading"):
        temp_dir = 'PDF Cases/Temp'
        # Start from a clean slate so pages left over from a previous
        # case cannot bleed into this run's output.
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        Scraper.download_loc(pdf_links[case_num])
        ImageProcessor.process_file(temp_dir)
        TextProcessor.process_file(temp_dir)
| |
| |
| |
| |
| |
| |
| |
| |
|
|
# Render the processed page images in a 4-wide grid, then the extracted
# paragraph table.  Guarded so the page renders cleanly before the first
# "Run" (when nothing has been downloaded yet) instead of crashing.
temp_dir = 'PDF Cases/Temp'
if os.path.exists(temp_dir):
    # BUG FIX: the old loop enumerated the (arbitrary-order) os.listdir result
    # but then rebuilt each path from the enumeration index, ignoring the file
    # it had actually found.  Read the found files, ordered by the numeric
    # page prefix the pipeline gives them ('<page>-processed.png').
    image_filenames = sorted(
        (f for f in os.listdir(temp_dir) if 'processed.png' in f),
        key=lambda f: int(f.split('-')[0]),
    )
    cols = st.columns(4)
    for i, fname in enumerate(image_filenames):
        image = cv2.imread(temp_dir + '/' + fname)
        with cols[i % 4]:
            # cv2.imread returns BGR; without this, Streamlit assumes RGB
            # and shows the pages with red/blue channels swapped.
            st.image(image, channels="BGR")
    csv_path = temp_dir + '/paragraphs.csv'
    if os.path.exists(csv_path):
        paras_df = pd.read_csv(csv_path)
        st.write(paras_df)